Example #1
    def compute_qval(self, action):
        "Based on the current features and action coefficients, compute the current qval"

        # The current Q-Value is the dot product of the features vector and the theta
        # vector for the current action.
        #qval = 0
        #for feature in self.feature_dict.keys():
        #	value = self.feature_dict[feature]
        #	coeff = self.theta[feature][action]
        #	qval += (value * coeff)

        # TODO: need input based off of the current state
        features_list = self.features.make_features_list(self.features.get_features())
        output_bits = self.nets[action].feed_forward(features_list)
        qval = nn_lib.bin2cont(output_bits, self.NUM_BITS, self.HIGH, self.LOW)

        return qval
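
The example relies on nn_lib.cont2bin / nn_lib.bin2cont to map between a continuous Q-value in [LOW, HIGH] and the bit vector the network consumes and produces. The project's actual nn_lib is not shown here; the following is a minimal, hypothetical sketch of what such an encode/decode pair could look like, assuming simple uniform quantization with NUM_BITS bits of resolution:

# Hypothetical sketch of the bit-vector <-> continuous-value helpers assumed
# above; the project's real nn_lib may encode values differently.

def cont2bin(value, num_bits, high, low):
    "Quantize a value in [low, high] into a list of num_bits bits (MSB first)."
    levels = (1 << num_bits) - 1
    clipped = min(max(value, low), high)
    step = int(round((clipped - low) / float(high - low) * levels))
    return [(step >> i) & 1 for i in reversed(range(num_bits))]

def bin2cont(bits, num_bits, high, low):
    "Invert cont2bin, thresholding soft network outputs to 0/1 along the way."
    levels = (1 << num_bits) - 1
    step = 0
    for bit in bits[:num_bits]:
        step = (step << 1) | (1 if bit >= 0.5 else 0)
    return low + (float(step) / levels) * (high - low)

# Round trip check: bin2cont(cont2bin(42.0, 8, 100.0, -100.0), 8, 100.0, -100.0)
# gives roughly 41.96, i.e. 42.0 up to the quantization error.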
Example #2
    def make_decision(self):
        "Make decision actually updates the theta values and returns the next action"

        # Get the feature values for the current state.
        cur_reward = self.get_reward()
        self.feature_dict = self.features.get_features()

        # Compute the Q-Value estimates for the current state.
        # This updates the cur_qvals dictionary.
        self.compute_qvals()

        # Get the next action according to our policy.
        greedy_action = self.get_policy_action()

        # If this is not the first move of a game and results mode is not on, then update
        # the NN for the approximation of the Q-Value function. The equation used here is
        # described in detail in our project report.
        if self.prev_action is not None and not self.results_mode:
            # Make a list of the previous feature values for the input of the NN.
            prev_features_list = self.features.make_features_list(
                self.prev_features)

            # Get the Q-Value for the previous direction.
            # TODO: Should this be the Q-Value that was used in the previous iteration or from the NN
            # right now? This matters because a train() call occurs in between the compute_qvals call
            # from the previous iteration and right here.
            #last_qval = self.prev_qval
            last_qval = nn_lib.bin2cont(
                self.nets[self.prev_action].feed_forward(prev_features_list),
                self.NUM_BITS, self.HIGH, self.LOW)

            # TODO: Make a design decision about this
            #max_qval = self.cur_qvals[greedy_action]	# SARSA
            max_qval = max(self.cur_qvals.values())  # Q-Learning

            # Compute the new qval
            #new_qval = last_qval + (cur_reward[0] + (self.gamma * max_qval) - last_qval) / (self.times[self.prev_action] + 1.0)
            new_qval = cur_reward[0] + (self.gamma * max_qval)

            # Transform the qval to a bit vector
            new_qval_bv = nn_lib.cont2bin(new_qval, self.NUM_BITS, self.HIGH,
                                          self.LOW)

            # Update the buffer for the prev_action.
            if len(self.buffer[self.prev_action]) >= BUFFER_SIZE:
                self.buffer[self.prev_action].pop(0)
            self.buffer[self.prev_action].append(
                (prev_features_list, new_qval_bv))

            # Train the NN with the previous features and the new qval
            error = self.nets[self.prev_action].train(
                self.buffer[self.prev_action], LEARNING_RATE / BUFFER_SIZE)

            # Debug stuff
            #if 1.0 in prev_features_list[0:4]:
            #if True:
            if False:
                after_bv = self.nets[self.prev_action].feed_forward(
                    prev_features_list)
                after = nn_lib.bin2cont(after_bv, self.NUM_BITS, self.HIGH,
                                        self.LOW)
                print 'prev_features_list:', prev_features_list
                print 'after_bv:', after_bv
                print 'new_qval:', new_qval
                print 'last_qval:', last_qval
                print 'after:', after
                print 'error:', error
                print
                print 'cur_features_list:', self.features.make_features_list(
                    self.feature_dict)
                print 'cur_qvals:', self.cur_qvals
                print 'greedy:', greedy_action, MOVE_LOOKUP[greedy_action]
                print '-----------------------------------------'
                print
                raw_input()

            #for feature in self.prev_features.keys():
            #	if cur_reward[1] in feature:
            #		self.theta[feature][self.prev_action] += (cur_reward[0] + (self.gamma * max_qval) - last_qval) \
            #				* (self.prev_features[feature] / (self.times[self.prev_action] + 1.0))

        # Decide whether to exploit or explore to pick the action for this decision.
        randvar = random.random()
        if randvar > self.get_epsilon() or self.results_mode:
            # Exploit
            cur_action = greedy_action
        else:
            # Explore

            # Get our current direction and position
            current_direction = self.state.current_pacman_direction
            current_position = self.state.pacman_rect.topleft

            # Out of the possible directions, pick a random one that is valid.
            # If all other directions are invalid, then go backwards.
            possible_directions = list(DIRECTIONS)
            valid = False
            new_direction = 0
            while new_direction == -current_direction or not valid:
                new_direction = random.choice(possible_directions)
                valid = self.game.checker.is_valid_move(
                    'pacman', current_position, new_direction)
                possible_directions.remove(new_direction)
                # Only force a reversal if we ran out of options without finding
                # a valid, non-reversing direction.
                if not possible_directions and (not valid or new_direction == -current_direction):
                    new_direction = -current_direction
                    break

            cur_action = new_direction

        # Update the times count for the current action.
        self.times[cur_action] += 1

        # If we are in a terminal state, then reset the state for the learning algorithm so
        # what is happening now does not propagate into the next game.
        status = self.check_terminal_state()
        if status in [DEATH, LEVEL_CLEAR]:
            self.prev_qval = None
            self.prev_action = None
            self.prev_features = {}
            self.decision_count = 0

            # Print a message to help monitor training.
            print 'finished game:', self.training_data['games_count'], \
                "level:", self.state.level_number, \
                {DEATH: 'DEATH', LEVEL_CLEAR: 'LEVEL_CLEAR'}[status]
            if self.state.pacman_lives == 1 and status == DEATH:
                print "GAME OVER     Level:", self.state.level_number, "Score:", self.state.score
            self.training_data['games_count'] += 1

            self.compute_results(status)

        else:
            # Update the state for the next decision.
            self.prev_qval = self.cur_qvals[cur_action]
            self.prev_action = cur_action
            self.prev_features = dict(self.feature_dict)
            self.decision_count += 1

        # Return the action that was picked.
        return cur_action
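
The training target assembled in make_decision is the standard one-step Q-learning target, reward plus gamma times the maximum of the current Q-value estimates (the commented-out SARSA line would bootstrap from the greedy policy action's estimate instead). A small standalone illustration of that computation, using made-up numbers rather than anything from the project:

# Standalone illustration of the training target built above; the reward,
# gamma and Q-values here are made-up numbers, not project output.
gamma = 0.9
cur_reward = 5.0                                # corresponds to cur_reward[0]
cur_qvals = {'UP': 1.2, 'DOWN': -0.4, 'LEFT': 0.7, 'RIGHT': 2.1}

# Q-learning: bootstrap from the best Q-value available in the current state.
max_qval = max(cur_qvals.values())              # 2.1
new_qval = cur_reward + (gamma * max_qval)      # 5.0 + 0.9 * 2.1 = 6.89

# In make_decision this target is quantized with nn_lib.cont2bin and appended,
# together with the previous feature vector, to the fixed-size buffer that is
# replayed to train the previous action's network.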