def compute_qval(self, action):
    "Compute the current Q-Value for the given action from the current feature values."
    # The Q-Value is produced by the action's neural network: feed the feature vector
    # forward and decode the resulting bit vector back into a continuous value.
    # Old linear approximation (dot product of the features and theta vectors), kept for reference:
    #qval = 0
    #for feature in self.feature_dict.keys():
    #    value = self.feature_dict[feature]
    #    coeff = self.theta[feature][action]
    #    qval += (value * coeff)
    # TODO: need input based off of the current state
    features_list = self.features.make_features_list(self.features.get_features())
    qval_bv = self.nets[action].feed_forward(features_list)
    qval = nn_lib.bin2cont(qval_bv, self.NUM_BITS, self.HIGH, self.LOW)
    return qval
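# compute_qval relies on nn_lib.cont2bin/bin2cont to move between a continuous Q-Value
# in [LOW, HIGH] and the NUM_BITS-wide bit vector the network actually produces. Below is
# a minimal sketch of one plausible round trip, assuming a plain fixed-point binary code;
# the real nn_lib routines may encode differently, and encode_qval / decode_qval are
# hypothetical names used only for illustration.
def encode_qval(value, num_bits, high, low):
    "Quantize a value clamped to [low, high] into a list of num_bits 0/1 entries."
    levels = (1 << num_bits) - 1
    clamped = min(max(value, low), high)
    step = int(round((clamped - low) / float(high - low) * levels))
    return [(step >> i) & 1 for i in reversed(range(num_bits))]

def decode_qval(bits, num_bits, high, low):
    "Map a list of num_bits (possibly fractional) NN outputs back to a value in [low, high]."
    levels = (1 << num_bits) - 1
    step = 0
    for bit in bits:
        step = (step << 1) | (1 if bit >= 0.5 else 0)  # threshold, since NN outputs are not exact 0/1
    return low + step * (high - low) / float(levels)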
def make_decision(self):
    "Make decision updates the Q-Value networks and returns the next action"
    # Get the feature values for the current state.
    cur_reward = self.get_reward()
    self.feature_dict = self.features.get_features()

    # Compute the Q-Value estimates for the current state.
    # This updates the cur_qvals dictionary.
    self.compute_qvals()

    # Get the next action according to our policy.
    greedy_action = self.get_policy_action()

    # If this is not the first move of a game and results mode is not on, then update
    # the NN approximation of the Q-Value function. The equation used here is
    # described in detail in our project report.
    if self.prev_action is not None and not self.results_mode:
        # Make a list of the previous feature values for the input of the NN.
        prev_features_list = self.features.make_features_list(self.prev_features)

        # Get the Q-Value for the previous direction.
        # TODO: Should this be the Q-Value that was used in the previous iteration or the one
        # from the NN right now? This matters because a train() call occurs between the
        # compute_qvals call of the previous iteration and this point.
        #last_qval = self.prev_qval
        last_qval = nn_lib.bin2cont(self.nets[self.prev_action].feed_forward(prev_features_list),
                                    self.NUM_BITS, self.HIGH, self.LOW)

        # TODO: Make a design decision about this.
        #max_qval = self.cur_qvals[greedy_action]   # SARSA
        max_qval = max(self.cur_qvals.values())     # Q-Learning

        # Compute the new qval.
        #new_qval = last_qval + (cur_reward[0] + (self.gamma * max_qval) - last_qval) / (self.times[self.prev_action] + 1.0)
        new_qval = cur_reward[0] + (self.gamma * max_qval)

        # Transform the qval into a bit vector.
        new_qval_bv = nn_lib.cont2bin(new_qval, self.NUM_BITS, self.HIGH, self.LOW)

        # Update the buffer for the prev_action, dropping the oldest entry if it is full.
        if len(self.buffer[self.prev_action]) >= BUFFER_SIZE:
            self.buffer[self.prev_action].pop(0)
        self.buffer[self.prev_action].append((prev_features_list, new_qval_bv))

        # Train the NN with the previous features and the new qval.
        error = self.nets[self.prev_action].train(self.buffer[self.prev_action],
                                                  LEARNING_RATE / BUFFER_SIZE)

        # Debug output for inspecting a single training step.
        #if 1.0 in prev_features_list[0:4]:
        #if True:
        if False:
            after_bv = self.nets[self.prev_action].feed_forward(prev_features_list)
            after = nn_lib.bin2cont(after_bv, self.NUM_BITS, self.HIGH, self.LOW)
            print 'prev_features_list:', prev_features_list
            print 'after_bv:', after_bv
            print 'new_qval:', new_qval
            print 'last_qval:', last_qval
            print 'after:', after
            print 'error:', error
            print
            print 'cur_features_list:', self.features.make_features_list(self.feature_dict)
            print 'cur_qvals:', self.cur_qvals
            print 'greedy:', greedy_action, MOVE_LOOKUP[greedy_action]
            print '-----------------------------------------'
            print
            raw_input()

        # Old linear (theta) update, kept for reference:
        #for feature in self.prev_features.keys():
        #    if cur_reward[1] in feature:
        #        self.theta[feature][self.prev_action] += (cur_reward[0] + (self.gamma * max_qval) - last_qval) \
        #            * (self.prev_features[feature] / (self.times[self.prev_action] + 1.0))

    # Decide whether to exploit or explore to pick the action for this decision.
    randvar = random.random()
    if randvar > self.get_epsilon() or self.results_mode:
        # Exploit
        cur_action = greedy_action
    else:
        # Explore
        # Get our current direction and position.
        current_direction = self.state.current_pacman_direction
        current_position = self.state.pacman_rect.topleft

        # Out of the possible directions, pick a random one that is valid.
        # If all other directions are invalid, then go backwards.
        possible_directions = list(DIRECTIONS)
        valid = False
        new_direction = 0
        while new_direction == -current_direction or not valid:
            new_direction = random.choice(possible_directions)
            valid = self.game.checker.is_valid_move('pacman', current_position, new_direction)
            possible_directions.remove(new_direction)
            if not possible_directions:
                # Every direction has been tried; only fall back to reversing if the
                # last candidate was not actually valid.
                if not valid:
                    new_direction = -current_direction
                break
        cur_action = new_direction

    # Update the times count for the current action.
    self.times[cur_action] += 1

    # If we are in a terminal state, then reset the state for the learning algorithm so
    # what is happening now does not propagate into the next game.
    status = self.check_terminal_state()
    if status in [DEATH, LEVEL_CLEAR]:
        self.prev_qval = None
        self.prev_action = None
        self.prev_features = {}
        self.decision_count = 0

        # Print a message to help monitor training.
        print 'finished game:', self.training_data['games_count'], "level:", self.state.level_number, {DEATH: 'DEATH', LEVEL_CLEAR: 'LEVEL_CLEAR'}[status]
        if self.state.pacman_lives == 1 and status == DEATH:
            print "GAME OVER Level:", self.state.level_number, "Score:", self.state.score

        self.training_data['games_count'] += 1
        self.compute_results(status)
    else:
        # Update the state for the next decision.
        self.prev_qval = self.cur_qvals[cur_action]
        self.prev_action = cur_action
        self.prev_features = dict(self.feature_dict)
        self.decision_count += 1

    # Return the action that was picked.
    return cur_action
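# make_decision currently trains toward a Q-Learning target and leaves a SARSA-style
# alternative commented out ("TODO: Make a design decision about this"). The two targets
# only differ in which next-state Q-Value they bootstrap from; the helpers below are a
# standalone illustration of that difference (hypothetical functions, not methods of the
# agent).
def q_learning_target(reward, gamma, cur_qvals):
    "Off-policy target: r + gamma * max_a Q(s', a), regardless of the action taken next."
    return reward + gamma * max(cur_qvals.values())

def sarsa_target(reward, gamma, cur_qvals, next_action):
    "On-policy target: r + gamma * Q(s', a') for the action the policy actually picks next."
    return reward + gamma * cur_qvals[next_action]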
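# The per-action training buffer in make_decision is a fixed-size FIFO: once it holds
# BUFFER_SIZE (features, target) pairs, the oldest is discarded with pop(0) before the
# new pair is appended. A collections.deque with maxlen gives the same behavior without
# the explicit length check; this is only a sketch of that alternative, not code the
# agent uses, and make_replay_buffers is a hypothetical helper name.
from collections import deque

def make_replay_buffers(actions, buffer_size):
    "Build one bounded FIFO buffer per action; appending past maxlen evicts the oldest entry."
    return dict((action, deque(maxlen=buffer_size)) for action in actions)

# Usage sketch (mirrors the buffer handling in make_decision):
#   buffers = make_replay_buffers(DIRECTIONS, BUFFER_SIZE)
#   buffers[prev_action].append((prev_features_list, new_qval_bv))    # no pop(0) needed
#   error = nets[prev_action].train(list(buffers[prev_action]), LEARNING_RATE / BUFFER_SIZE)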