예제 #1
0
파일: qlearner.py 프로젝트: zilehuda/CIS521
    def do_turn(self):
        """
        Run one turn: refresh the global state, choose a direction for every
        relevant ant, resolve collisions and cache per-ant data for the next
        turn's weight update.

        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant.  You shouldn't need to modify
        this function.
        """

        # Build the global state on the first turn (grid lookup resolution:
        # size 10 squares) and only refresh it afterwards.  Identity test, so
        # a state object with a permissive __eq__ can never trigger a rebuild.
        if self.state is None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location,
                                                      ant.direction)
            ant.prev_value = self.value(self.state, ant.location,
                                        ant.direction)

        # Single-argument call form: same output as the Python 2 print
        # statement, and still parses on Python 3.
        # NOTE(review): this prints whatever L.info() returns -- confirm that
        # echoing the return value is intended.
        print(self.world.L.info(str(self)))
예제 #2
0
    def do_turn(self):
        """Precompute global state, then choose the max value action for each ant independently.

        A living ant whose chosen move would land on a square already claimed
        by an earlier ant this turn is told to stay put (direction None).
        """

        # Squares claimed so far this turn, mapped to the claiming ant's id.
        next_locations = {}

        # Build the global state on the first turn, refresh it afterwards.
        # The grid resolution comes from FOG, which this method expects to
        # find at module level.
        if self.state is None:
            self.state = GlobalState(self.world, resolution=FOG)
        else:
            self.state.update()

        # Run the routine for each living ant independently.
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                ant.direction = self.get_direction(ant)
                if ant.direction == 'halt' or ant.direction is None:
                    # Normalise "no move" to None.
                    # NOTE(review): a halted ant does not reserve its current
                    # square, so a later ant may still be sent onto it --
                    # confirm whether that is acceptable.
                    ant.direction = None
                    continue

                # Basic collision detection: don't land on the same square as another friendly ant.
                nextpos = self.world.next_position(ant.location,
                                                   ant.direction)
                if nextpos in next_locations:
                    ant.direction = None
                else:
                    next_locations[nextpos] = ant.ant_id
예제 #3
0
 def do_turn(self):
     """Precompute global state, then choose the max value action for each ant independently.

     A living ant whose chosen move would land on a square already claimed
     by an earlier ant this turn is told to stay put (direction None).
     """

     # Squares claimed so far this turn, mapped to the claiming ant's id.
     next_locations = {}

     # Build the global state on the first turn, refresh it afterwards.
     # The grid resolution comes from FOG, which this method expects to
     # find at module level.
     if self.state is None:
         self.state = GlobalState(self.world, resolution=FOG)
     else:
         self.state.update()

     # Run the routine for each living ant independently.
     for ant in self.world.ants:
         if ant.status == AntStatus.ALIVE:
             ant.direction = self.get_direction(ant)
             if ant.direction == 'halt' or ant.direction is None:
                 # Normalise "no move" to None.
                 # NOTE(review): a halted ant does not reserve its current
                 # square, so a later ant may still be sent onto it --
                 # confirm whether that is acceptable.
                 ant.direction = None
                 continue

             # Basic collision detection: don't land on the same square as another friendly ant.
             nextpos = self.world.next_position(ant.location, ant.direction)
             if nextpos in next_locations:
                 ant.direction = None
             else:
                 next_locations[nextpos] = ant.ant_id
예제 #4
0
파일: qlearner.py 프로젝트: lkirbyl/ANTS-
    def do_turn(self):
        """
        Run one turn: update the turn counter and map-coverage statistics,
        refresh the global state, choose a direction for every relevant ant,
        resolve collisions and cache per-ant data for next turn's update.

        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant.  You shouldn't need to modify
        this function.
        """
        self.nturns += 1
        self.lastPercentSeen = self.percentSeen

        # Fraction of map entries whose value is above -5.  The count must be
        # divided by the whole area (width * height) in floating point; the
        # previous `count / width * height` divided by the width only and then
        # multiplied by the height, with integer division on Python 2.
        # NOTE(review): presumably values above -5 mark squares that have been
        # seen, and world.map iterates over individual squares -- verify
        # against the world implementation.
        seen = sum(1 for loc in self.world.map if loc > -5)
        self.percentSeen = float(seen) / (self.world.width * self.world.height)

        # Build the global state on the first turn (grid lookup resolution:
        # size 10 squares), refresh it afterwards.
        if self.state is None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)
                # Normalise 'halt' to None.  Compare by value: `is 'halt'`
                # only worked when the two strings happened to be the same
                # object.
                if ant.direction == 'halt':
                    ant.direction = None

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state, ant.location, ant.direction)
            ant.prev_value = self.value(self.state, ant.location, ant.direction)

        # Single-argument call form: same output as the Python 2 print
        # statement, and still parses on Python 3.
        # NOTE(review): this prints whatever L.info() returns -- confirm that
        # echoing the return value is intended.
        print(self.world.L.info(str(self)))
예제 #5
0
    def do_turn(self):
        """
        Run one turn: refresh the global state, choose a direction for every
        relevant ant, resolve collisions and cache per-ant data for the next
        turn's weight update.

        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant.  You shouldn't need to modify
        this function.
        """

        # Build the global state on the first turn, refresh it afterwards.
        # Both grid resolutions are half the square root of viewradius2,
        # truncated to an int.  Identity test, so a state object with a
        # permissive __eq__ can never trigger a rebuild.
        if self.state is None:
            fog = int(math.sqrt(self.world.viewradius2) / 2)
            self.state = GlobalState(self.world, visited_resolution=fog, resolution=fog)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state, ant.location, ant.direction)
            ant.prev_value = self.value(self.state, ant.location, ant.direction)
예제 #6
0
파일: qlearner.py 프로젝트: zilehuda/CIS521
class QLearnBot(ValueBot):
    """ValueBot that adjusts its weights every turn with a Q-learning update."""

    def __init__(self, world, load_file="save_bots/qbot.json"):
        """Initialise through ValueBot and set up the game counters.

        Args:
            world: world object handed on to ValueBot.__init__.
            load_file: file path handed on to ValueBot.__init__.
        """
        ValueBot.__init__(self, world, load_file)
        # Kept for any code that still reads the old attribute name.
        self.ngame = 0
        # explore_and_exploit reads `ngames`; make sure it exists, without
        # clobbering a value that may already have been set elsewhere.
        if not hasattr(self, 'ngames'):
            self.ngames = 0

    def get_reward(self, reward_state):
        """
        Hand-tuned rewards for a state.  The RewardState reward_state tracks the
        following events for which you can tailor rewards:
            reward_state.food_eaten: Fraction of a food item this ant ate (between 0 and 1)
            reward_state.was_killed: boolean flag whether the ant died this turn
            reward_state.death_dealt: Fraction of responsibility this ant contributed to killing other ants
                (e.g., if 2 ants killed an enemy ant, each would have death_dealt=1/2)

        Returns:
            The weighted sum 0.7 * food_eaten + 0.4 * death_dealt
            - 0.00001 * was_killed.
        """
        reward = reward_state.food_eaten * .7 + reward_state.death_dealt * .4 + reward_state.was_killed * -.00001
        return reward

    def avoid_collisions(self):
        """
        Simple logic to avoid collisions: a living ant whose next square was
        already claimed by an earlier ant has its direction reset to None.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                # Basic collision detection: don't land on the same square as another friendly ant.
                nextpos = self.world.next_position(ant.location, ant.direction)
                # Membership test on the dict itself (no key list is built).
                if nextpos in next_locations:
                    ant.direction = None
                else:
                    next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """
        Run one turn: refresh the global state, choose a direction for every
        relevant ant, resolve collisions and cache per-ant data for the next
        turn's weight update.

        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant.  You shouldn't need to modify
        this function.
        """

        # Build the global state on the first turn (grid lookup resolution:
        # size 10 squares), refresh it afterwards.
        if self.state is None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location,
                                                      ant.direction)
            ant.prev_value = self.value(self.state, ant.location,
                                        ant.direction)

        # Single-argument call form: same output as the Python 2 print
        # statement, and still parses on Python 3.
        # NOTE(review): this prints whatever L.info() returns -- confirm that
        # echoing the return value is intended.
        print(self.world.L.info(str(self)))

    def update_weights(self, alpha, discount, reward, maxval, prevval,
                       features):
        """
        Apply the Q-learning weight update in place:
            w[i] += alpha * (reward + discount * maxval - prevval) * features[i]

        Args:
            alpha: step size.
            discount: discount factor applied to maxval.
            reward: reward observed for the previous action.
            maxval: best action value available in the current state.
            prevval: value that was recorded for the previous action.
            features: feature vector recorded for the previous action,
                indexed like self.weights.
        """
        # The scaled TD error is the same for every weight; compute it once.
        step = alpha * (reward + (discount * maxval) - prevval)
        for i in range(len(self.weights)):
            self.weights[i] += step * features[i]

    def explore_and_exploit(self, ant):
        '''
        Update the weights from the ant's previous move, then return the
        direction to take now: either a random passable direction (explore)
        or the highest-valued one (exploit).  Returns 'halt' when no
        direction is passable.
        '''

        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)
        if len(actions) == 0:
            return 'halt'

        # if we have a newborn baby ant, init its rewards and quality fcns
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, actions[0])
            return actions[0]

        # step 1, update Q(s,a) based on going from last state, taking
        # the action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events)

        # step size
        alpha = 0.00001

        # discount factor applied to the best next-state value
        discount = 0.00001

        # max_a' Q(s',a') and its argmax.  Seed BOTH from the first action:
        # seeding the action with 'halt' while seeding the value from
        # actions[0] made the bot halt whenever actions[0] was the best (or
        # tied-best) move.
        max_next_action = actions[0]
        max_next_value = self.value(self.state, ant.location, max_next_action)
        for action in actions[1:]:
            value = self.value(self.state, ant.location, action)
            if value > max_next_value:
                max_next_value = value
                max_next_action = action

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha, discount, R, max_next_value, ant.prev_value,
                            ant.prev_features)

        # step 2, explore or exploit?  The exploration probability shrinks as
        # 0.7 / ngames; clamp the divisor so a zero game count cannot raise
        # ZeroDivisionError.
        explore = 0.7 / max(self.ngames, 1)
        decision = random.random()
        if explore >= decision:
            # actions was shuffled above, so actions[0] is a random move
            return actions[0]
        return max_next_action
예제 #7
0
class ValueBot(AntsBot):
    """ Value function based AntsBot.

    This is a template class that uses a FeatureExtractor and a set of weights to make decisions
    based on a weighted sum of features (value function.) It is capable of loading and saving to JSON using
    the FeatureExtractor.to_dict() method and FeatureExtractor(input_dict) constructor.

    """
    def __init__(self, world, load_file="valuebot.json"):
        """Initialize, optionally loading features and weights from file.

        Args:
            world: world object handed on to AntsBot.__init__; its
                viewradius2 is stored as self.fog.
            load_file: path of a JSON file with 'features' and 'weights'
                entries.  Ignored when None or when the file does not exist.

        NOTE(review): the original note here said this bot disables tracking
        of friendly ants in the AntWorld (so ant_id is not consistent between
        turns), yet the code below sets world.stateless to False -- confirm
        which of the two is intended.
        """

        AntsBot.__init__(self, world)
        self.state = None
        self.features = None
        self.weights = None

        self.fog = world.viewradius2

        # See the NOTE(review) in the docstring about this flag.
        self.world.stateless = False

        # Try to load saved configuration from file
        if load_file is not None and os.path.exists(load_file):
            # `with open(...)` closes the handle even if parsing fails, and
            # works where the Python 2-only file() builtin does not exist.
            with open(load_file, "r") as fp:
                data = json.load(fp)
            self.set_features(FeatureExtractor(data['features']))
            self.set_weights(data['weights'])

    def save(self, filename):
        """Save features and weights to file as JSON."""

        data = {'features': self.features.to_dict(), 'weights': self.weights}
        with open(filename, "w") as fp:
            json.dump(data, fp)

    def __str__(self):
        """Return a labeled list of weight values, one feature per line."""

        lines = ['ValueBot:\n']
        for i in range(self.features.num_features()):
            lines.append('\t%s = %g\n' % (self.features.feature_name(i),
                                          self.weights[i]))
        return ''.join(lines)

    def set_features(self, extractor):
        """Set the feature extractor and log it at debug level."""
        self.features = extractor
        self.world.L.debug("Setting features: %s" % str(self.features))

    def set_weights(self, weights):
        """Set weight vector. Note: checks that len(weights) == self.features.num_features().

        Raises:
            AssertionError: if no features are set or the lengths differ.
                The weights have already been stored when this is raised.
        """

        self.weights = weights
        self.world.L.debug("Setting weights: %s" % str(self.weights))
        if self.features is None or len(
                self.weights) != self.features.num_features():
            raise AssertionError("Features need to be set before weights!")

    def value(self, state, loc, action):
        """Compute the value of a given action w.r.t. a given state and ant location.

        The value is the sum of the weights of every feature whose extracted
        entry is truthy.

        NOTE(review): entries are only tested for truth, not multiplied in --
        this equals a dot product only for 0/1 features; confirm features are
        binary.
        """

        feature_vector = self.features.extract(self.world, state, loc, action)

        self.world.L.info("Evaluating move: %s, %s:" % (str(loc), action))
        dot_product = 0
        for i in range(0, len(feature_vector)):
            if feature_vector[i]:
                self.world.L.info(
                    "\tf: %s = %g" %
                    (self.features.feature_name(i), self.weights[i]))
                dot_product += self.weights[i]
        self.world.L.info("\tdot_product = %g" % dot_product)

        return dot_product

    def get_direction(self, ant):
        """Evaluates each of the currently passable directions and picks the one with maximum value.

        Returns None when there is no passable direction.
        """

        # get the passable directions, in random order to break ties
        rand_dirs = self.world.get_passable_directions(ant.location,
                                                       AIM.keys())
        random.shuffle(rand_dirs)

        # evaluate the value function for each possible direction, keeping
        # the first direction that reaches the maximum
        max_value = float("-inf")
        max_dir = None
        for direction in rand_dirs:
            candidate = self.value(self.state, ant.location, direction)
            if candidate > max_value:
                max_value = candidate
                max_dir = direction

        # take direction with maximum value
        self.world.L.info("Chose: %s, value: %.2f" % (max_dir, max_value))
        return max_dir

    # Main logic
    def do_turn(self):
        """Precompute global state, then choose the max value action for each ant independently.

        A living ant whose chosen move would land on a square already claimed
        by an earlier ant this turn is told to stay put (direction None).
        """

        # Squares claimed so far this turn, mapped to the claiming ant's id.
        next_locations = {}

        # Build the global state on the first turn, refresh it afterwards.
        # NOTE(review): the resolution comes from a module-level FOG, while
        # __init__ stores self.fog and never uses it -- confirm which one is
        # meant.
        if self.state is None:
            self.state = GlobalState(self.world, resolution=FOG)
        else:
            self.state.update()

        # Run the routine for each living ant independently.
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                ant.direction = self.get_direction(ant)
                if ant.direction == 'halt' or ant.direction is None:
                    # Normalise "no move" to None.
                    ant.direction = None
                    continue

                # Basic collision detection: don't land on the same square as another friendly ant.
                nextpos = self.world.next_position(ant.location,
                                                   ant.direction)
                if nextpos in next_locations:
                    ant.direction = None
                else:
                    next_locations[nextpos] = ant.ant_id

    def reset(self):
        """Drop the cached global state so the next do_turn rebuilds it."""
        self.state = None
예제 #8
0
class QLearnBot(ValueBot):
    """ValueBot that adjusts its weights every turn with a Q-learning update."""

    def __init__(self, world, load_file="save_bots/qbot.json"):
        """Initialise through ValueBot and set up counters and the pathfinder slot.

        Args:
            world: world object, stored and handed on to ValueBot.__init__.
            load_file: file path handed on to ValueBot.__init__.
        """
        self.world = world
        ValueBot.__init__(self, world, load_file)
        self.nturns = 0
        self.pathfinder = None
        # explore_and_exploit reads `ngames`; make sure it exists, without
        # clobbering a value that may already have been set elsewhere.
        if not hasattr(self, 'ngames'):
            self.ngames = 0

    def get_reward(self, reward_state, ant):
        """
        Hand-tuned rewards for a state.  The RewardState reward_state tracks the
        following events for which you can tailor rewards:
            reward_state.food_eaten: Fraction of a food item this ant ate (between 0 and 1)
            reward_state.was_killed: boolean flag whether the ant died this turn
            reward_state.death_dealt: Fraction of responsibility this ant contributed to killing other ants
                (e.g., if 2 ants killed an enemy ant, each would have death_dealt=1/2)
            reward_state.razed_hill:
            reward_state.hill_distance: Fraction, 1/x
            reward_state.friendy_hill_razed:

        On top of those, an exploration bonus is derived from how often the
        ant's current location has been visited: 1 for an unvisited location,
        otherwise 1 / (visits * 100).

        Returns:
            The sum of LIVING_REWARD and each event scaled by its module-level
            *_REWARD / EXPLORE_BONUS constant.
        """

        # Look the visit count up once and reuse it.
        visits = self.state.get_visited(ant.location)
        if visits == 0:
            explore_bonus = 1
        else:
            explore_bonus = float(1) / (visits * 100)

        # Single-argument call form: same output as the Python 2 print
        # statement, and still parses on Python 3.
        print(":::::Reward Info::::")
        print("food_eaten: " + str(reward_state.food_eaten))
        print("was_killed: " + str(reward_state.was_killed))
        print("death_dealt: " + str(reward_state.death_dealt))
        print("hill_razed: " + str(reward_state.razed_hill))
        print("hill_distance: " + str(reward_state.hill_distance))
        print("friendly hill razed: " + str(reward_state.friendy_hill_razed))
        print("exploration bonus: " + str(explore_bonus))
        print("::::::::::::::::::::")

        reward = 0
        reward += LIVING_REWARD
        reward += FOOD_REWARD * reward_state.food_eaten
        reward += DEATH_REWARD * reward_state.was_killed
        reward += KILL_REWARD * reward_state.death_dealt
        reward += RAZED_REWARD * reward_state.razed_hill
        reward += HILL_DISTANCE_REWARD * reward_state.hill_distance
        reward += FRIENDLY_HILL_RAZED_REWARD * reward_state.friendy_hill_razed
        reward += EXPLORE_BONUS * explore_bonus

        return reward

    def set_pathfinder(self, pathfinder):
        """Store the pathfinder object on the bot."""
        self.pathfinder = pathfinder

    def avoid_collisions(self):
        """
        Simple logic to avoid collisions: a living ant whose next square was
        already claimed by an earlier ant has its direction set to 'halt'.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                # Basic collision detection: don't land on the same square as another friendly ant.
                nextpos = self.world.next_position(ant.location, ant.direction)
                # Membership test on the dict itself (no key list is built).
                if nextpos in next_locations:
                    ant.direction = 'halt'
                else:
                    next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """
        Run one turn: refresh the global state, choose a direction for every
        relevant ant, resolve collisions and cache per-ant data for the next
        turn's weight update.

        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant.  You shouldn't need to modify
        this function.
        """

        # Build the global state on the first turn, refresh it afterwards.
        # Both grid resolutions are half the square root of viewradius2,
        # truncated to an int.
        if self.state is None:
            fog = int(math.sqrt(self.world.viewradius2) / 2)
            self.state = GlobalState(self.world,
                                     visited_resolution=fog,
                                     resolution=fog)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location,
                                                      ant.direction)
            ant.prev_value = self.value(self.state, ant.location,
                                        ant.direction)

    def update_weights(self, alpha, discount, reward, maxval, prevval,
                       features):
        """
        Apply the Q-learning weight update in place:
            w[i] += alpha * (reward + discount * maxval - prevval) * features[i]

        Args:
            alpha: step size.
            discount: discount factor applied to maxval.
            reward: reward observed for the previous action.
            maxval: best action value available in the current state.
            prevval: value that was recorded for the previous action.
            features: feature vector recorded for the previous action,
                indexed like self.weights.
        """
        # The scaled TD error is the same for every weight; compute it once.
        step = alpha * (reward + discount * maxval - prevval)
        for i in range(len(self.weights)):
            self.weights[i] += step * features[i]

    def explore_and_exploit(self, ant):
        '''
        Update the weights from the ant's previous move, then return the
        direction to take now: either a random passable direction (explore)
        or the highest-valued one (exploit).  Returns 'halt' when no
        direction is passable.
        '''

        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)

        if len(actions) == 0:
            return 'halt'
        # if we have a newborn baby ant, init its rewards and quality fcns
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, actions[0])
            return actions[0]
        # step 1, update Q(s,a) based on going from last state, taking
        # the action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events, ant)

        # step size.  it's good to make this inversely proportional to the
        # number of features, so you don't bounce out of the bowl we're trying
        # to descend via gradient descent
        alpha = float(1) / (len(ant.prev_features) * ALPHA_DIVIDER)
        discount = DISCOUNT

        # max_a' Q(s',a') and its argmax, seeded from the first action.
        # Seeding the running maximum with 0 / 'halt' meant that whenever no
        # action had a strictly positive value the bot halted and used 0 as
        # the TD target instead of the real maximum.
        max_next_action = actions[0]
        max_next_value = self.value(self.state, ant.location, max_next_action)
        for action in actions[1:]:
            val = self.value(self.state, ant.location, action)
            if max_next_value < val:
                max_next_value = val
                max_next_action = action

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha, discount, R, max_next_value, ant.prev_value,
                            ant.prev_features)

        # step 2, explore or exploit?  Explore only while fewer than
        # EXPLORE_THRESHOLD games have been played.
        # NOTE(review): past the threshold the previous code drew
        # random.randint(0, (ngames - EXPLORE_THRESHOLD) / 2) and then
        # unconditionally overwrote the outcome with False, so the draw never
        # had any effect.  The dead draw is removed and the effective
        # behaviour kept -- confirm whether decaying exploration was intended.
        decide_to_explore = self.ngames < EXPLORE_THRESHOLD
        if decide_to_explore:
            # actions was shuffled above, so actions[0] is a random move
            return actions[0]
        return max_next_action
예제 #9
0
파일: qlearner.py 프로젝트: lkirbyl/ANTS-
class QLearnBot(ValueBot):
    """ValueBot that adjusts its weights every turn with a Q-learning update."""

    def __init__(self, world, load_file="save_bots/qbot.json"):
        """Initialise through ValueBot and set up turn and coverage counters.

        Args:
            world: world object handed on to ValueBot.__init__.
            load_file: file path handed on to ValueBot.__init__.
        """
        ValueBot.__init__(self, world, load_file)
        self.nturns = 0
        self.percentSeen = 0
        self.lastPercentSeen = 0
        self.world.stateless = False
        # explore_and_exploit reads `ngames`; make sure it exists, without
        # clobbering a value that may already have been set elsewhere.
        if not hasattr(self, 'ngames'):
            self.ngames = 0

    def get_reward(self, reward_state):
        """
        Hand-tuned rewards for a state.  The RewardState reward_state tracks the
        following events for which you can tailor rewards:
            reward_state.food_eaten: Fraction of a food item this ant ate (between 0 and 1)
            reward_state.was_killed: boolean flag whether the ant died this turn
            reward_state.death_dealt: Fraction of responsibility this ant contributed to killing other ants
                (e.g., if 2 ants killed an enemy ant, each would have death_dealt=1/2)
            reward_state.razed_hills: scaled by 100 in the reward

        Returns:
            -0.001 when none of the events occurred; otherwise
            1 / death_dealt (only if death_dealt > 0) + 10 * food_eaten
            - was_killed + 100 * razed_hills.
        """
        # A turn counts as uneventful only when EVERY tracked event is absent.
        # razed_hills is part of the check so that a turn whose only event is
        # a razed hill still earns its reward instead of the idle penalty, and
        # was_killed is tested by truth rather than by identity with False.
        nothing_happened = (reward_state.death_dealt == 0
                            and not reward_state.was_killed
                            and reward_state.food_eaten == 0
                            and reward_state.razed_hills == 0)
        if nothing_happened:
            return -.001

        reward = 0
        if reward_state.death_dealt > 0:
            reward += 1. / reward_state.death_dealt
        reward += 10 * reward_state.food_eaten - reward_state.was_killed
        reward += 100 * reward_state.razed_hills

        return reward

    def avoid_collisions(self):
        """
        Simple logic to avoid collisions: a living ant whose next square was
        already claimed by an earlier ant has its direction reset to None.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                # Basic collision detection: don't land on the same square as another friendly ant.
                nextpos = self.world.next_position(ant.location, ant.direction)
                # Membership test on the dict itself (no key list is built).
                if nextpos in next_locations:
                    ant.direction = None
                else:
                    next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """
        Run one turn: update the turn counter and map-coverage statistics,
        refresh the global state, choose a direction for every relevant ant,
        resolve collisions and cache per-ant data for next turn's update.

        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant.  You shouldn't need to modify
        this function.
        """
        self.nturns += 1
        self.lastPercentSeen = self.percentSeen

        # Fraction of map entries whose value is above -5.  The count must be
        # divided by the whole area (width * height) in floating point; the
        # previous `count / width * height` divided by the width only and then
        # multiplied by the height, with integer division on Python 2.
        # NOTE(review): presumably values above -5 mark squares that have been
        # seen, and world.map iterates over individual squares -- verify
        # against the world implementation.
        seen = sum(1 for loc in self.world.map if loc > -5)
        self.percentSeen = float(seen) / (self.world.width * self.world.height)

        # Build the global state on the first turn (grid lookup resolution:
        # size 10 squares), refresh it afterwards.
        if self.state is None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)
                # Normalise 'halt' to None.  Compare by value: `is 'halt'`
                # only worked when the two strings happened to be the same
                # object.
                if ant.direction == 'halt':
                    ant.direction = None

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state, ant.location, ant.direction)
            ant.prev_value = self.value(self.state, ant.location, ant.direction)

        # Single-argument call form: same output as the Python 2 print
        # statement, and still parses on Python 3.
        # NOTE(review): this prints whatever L.info() returns -- confirm that
        # echoing the return value is intended.
        print(self.world.L.info(str(self)))

    def update_weights(self, alpha, discount, reward, maxval, prevval, features):
        """
        Apply the Q-learning weight update in place:
            w[i] += alpha * (reward + discount * maxval - prevval) * features[i]

            alpha = alpha
            discount = gamma
            reward = R(s)
            maxval = maxQw(s',a')
            prevval = Qw(s,a)
            features = feature vector recorded for (s,a), indexed like self.weights
        """
        # The scaled TD error is the same for every weight; compute it once.
        step = alpha * (reward + discount * maxval - prevval)
        for i in range(len(self.weights)):
            self.weights[i] += step * features[i]

    def explore_and_exploit(self, ant):
        '''
        Update the weights from the ant's previous move, then return the
        direction to take now: either a random passable direction (explore)
        or the highest-valued one (exploit).  Returns 'halt' when no
        direction is passable.
        '''

        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)
        if len(actions) == 0:
            return 'halt'

        # if we have a newborn baby ant, init its rewards and quality fcns.
        # There is no early return: the newborn continues into the weight
        # update below using these initial values.
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state, ant.location, actions[0])

        # step 1, update Q(s,a) based on going from last state, taking
        # the action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events)

        # step size
        alpha = .0001

        # discount factor applied to the best next-state value
        discount = 0.5

        # max_a' Q(s',a') and its argmax over the passable directions; the
        # first direction reaching the maximum wins.
        max_next_value = float('-inf')
        max_next_action = 'halt'
        for action in actions:
            candidate = self.value(self.state, ant.location, action)
            if candidate > max_next_value:
                max_next_value = candidate
                max_next_action = action

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha, discount, R, max_next_value, ant.prev_value, ant.prev_features)

        # step 2, explore or exploit?  Explore with probability
        # 1 / (ngames + 1), which shrinks as more games are played.
        if random.random() < 1. / (self.ngames + 1):
            # actions was shuffled above, so actions[0] is a random move
            return actions[0]
        return max_next_action
예제 #10
0
class QLearnBot(ValueBot):
    """ValueBot whose weights are trained online with a Q-learning update.

    Every turn, each living (or just-killed) ant first updates the shared
    weight vector from the reward recorded for its previous action and then
    either explores (a random passable direction) or exploits (the passable
    direction with the highest current value).
    """

    def __init__(self,world, load_file="save_bots/qbot.json"):
        """Create the bot; ValueBot.__init__ loads load_file when it exists."""
        ValueBot.__init__(self,world, load_file)
        # NOTE(review): nothing in this class reads self.ngame; the exploration
        # schedule in explore_and_exploit uses self.ngames, which is presumably
        # supplied by the base class or the training harness -- confirm before
        # renaming or removing this attribute.
        self.ngame = 0

    def get_reward(self,reward_state):
        """Return the hand-tuned scalar reward for one ant's previous turn.

        reward_state exposes the events the reward is built from:
            reward_state.food_eaten: fraction of a food item this ant ate
                (between 0 and 1)
            reward_state.was_killed: boolean flag, whether the ant died
            reward_state.death_dealt: fraction of responsibility this ant had
                in killing enemy ants (two ants sharing a kill get 1/2 each)

        Eating is rewarded (x2.0), kills are mildly rewarded (x0.3) and dying
        is penalised (x-0.8).
        """
        food_reward = 2.0 * reward_state.food_eaten
        killer_reward = 0.3 * reward_state.death_dealt
        death_reward = -0.8 * reward_state.was_killed
        return food_reward + killer_reward + death_reward

    def avoid_collisions(self):
        """Cancel the move of any living ant heading for an already-claimed square.

        Ants are processed in self.world.ants order; the first ant to claim a
        destination keeps its direction, later ants targeting the same square
        get direction None.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status != AntStatus.ALIVE:
                continue
            nextpos = self.world.next_position(ant.location, ant.direction)
            # Test the dict directly (hash lookup) instead of scanning a key list.
            if nextpos in next_locations:
                ant.direction = None
            else:
                next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """Run one game turn: learn from last turn, pick moves, record features.

        Builds the GlobalState on the first call and refreshes it afterwards,
        runs explore_and_exploit for every ant that is alive or was just
        killed, resolves collisions, and finally stores the features and value
        of the chosen action on each ant so the next turn's update can use
        them.
        """
        # Grid lookup resolution: size 10 squares
        if self.state is None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state, ant.location, ant.direction)
            ant.prev_value = self.value(self.state,ant.location,ant.direction)

        # Log the current weights. The logger call's return value is not
        # printed: with a standard logging.Logger that would only emit "None".
        # NOTE(review): assumes self.world.L is a logging-style logger -- confirm.
        self.world.L.info(str(self))

    def update_weights(self,alpha,discount,reward,maxval,prevval,features):
        """Apply one Q-learning step to self.weights in place.

        Each weight moves by
            alpha * (reward + discount * maxval - prevval) * features[i]

        alpha    -- step size
        discount -- discount factor applied to the best next-state value
        reward   -- reward observed for the previous action
        maxval   -- max over actions of Q(s', a') in the current state
        prevval  -- Q(s, a) recorded when the previous action was chosen
        features -- feature vector recorded for the previous (s, a) pair
        """
        # The temporal-difference error is the same for every weight.
        step = alpha * (reward + discount * maxval - prevval)
        for i in range(len(self.weights)):
            self.weights[i] += step * features[i]

    def explore_and_exploit(self,ant):
        """Update the weights from ant's last move and choose its next direction.

        Returns 'halt' when no direction is passable, a random passable
        direction for a newborn ant or when exploring, and otherwise the
        passable direction with the highest current value.
        """
        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)
        if len(actions)==0:
            return 'halt'

        # A newborn ant has no previous action to learn from: initialise its
        # bookkeeping and send it in a random passable direction.
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state, ant.location, actions[0])
            return actions[0]

        # step 1, update Q(s,a) based on going from last state, taking
        # the action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events)

        # Step size inversely proportional to the number of weights, so the
        # gradient step does not overshoot as features are added.
        alpha = 0.01 / (len(self.weights))

        # discount = 1.0 means future rewards count as much as immediate ones.
        discount = 1.0

        # max_a' Q(s',a') and its argmax over the passable actions in the
        # current state s'.
        # NOTE(review): max_by is defined outside this class; from this call it
        # appears to return a (best_value, best_item) pair -- confirm.
        (max_next_value, max_next_action) = max_by(actions, lambda x: self.value(self.state,ant.location,x))

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha,discount,R,max_next_value,ant.prev_value,ant.prev_features)

        # step 2, explore or exploit? Always explore before explore_start
        # games, never after explore_stop, and in between explore with a
        # probability that falls linearly from 1 to 0.
        if self.ngames < explore_start:
            decide_to_explore = True
        elif self.ngames < explore_stop:
            p = 1.0 * (explore_stop - self.ngames) / (explore_stop - explore_start)
            decide_to_explore = random.random() < p
        else:
            decide_to_explore = False

        if decide_to_explore:
            # actions was shuffled above, so actions[0] is a random choice.
            return actions[0]
        return max_next_action
예제 #11
0
class ValueBot(AntsBot):
    """ Value function based AntsBot.

    This is a template class that uses a FeatureExtractor and a set of weights to make decisions
    based on a weighted sum of features (value function.) It is capable of loading and saving to JSON using
    the FeatureExtractor.to_dict() method and FeatureExtractor(input_dict) constructor.

    """

    def __init__(self, world, load_file="valuebot.json"):
        """Initialize, optionally loading features and weights from a JSON file.

        world     -- the game world this bot plays in
        load_file -- path of a JSON file with 'features' and 'weights' keys;
                     ignored when None or when the file does not exist
        """
        AntsBot.__init__(self, world)
        self.state = None
        self.features = None
        self.weights = None

        # NOTE(review): this sets stateless to False, whereas earlier comments
        # described disabling ant tracking for speed. Presumably False keeps
        # per-ant tracking on, which subclasses that store attributes on ant
        # objects between turns would need -- confirm against AntWorld.
        self.world.stateless = False

        # Try to load saved configuration from file
        if load_file is not None and os.path.exists(load_file):
            # The with-block closes the file even if the JSON is malformed.
            with open(load_file, "r") as fp:
                data = json.load(fp)
            self.set_features(FeatureExtractor(data['features']))
            self.set_weights(data['weights'])

    def save(self, filename):
        """Save features and weights to filename as JSON."""
        # Build the payload before opening the file so a failure in to_dict()
        # does not leave a truncated file behind.
        data = {'features': self.features.to_dict(),
                'weights': self.weights }
        with open(filename, "w") as fp:
            json.dump(data, fp)

    def save_readable(self, filename):
        """Write the human-readable str(self) listing of weights to filename."""
        text = str(self)
        with open(filename, "w") as fp:
            fp.write(text)

    def __str__(self):
        """Return a labeled list of weight values, one 'name = value' per line."""
        if self.features is None or self.weights is None:
            # Nothing loaded yet; str() should still be safe to call for logging.
            return 'ValueBot:\n\t(features/weights not set)\n'

        lines = ['ValueBot:\n']
        for i in range(self.features.num_features()):
            lines.append('\t%s = %g\n' % (self.features.feature_name(i),
                                          self.weights[i]))
        return ''.join(lines)

    def set_features(self, extractor):
        """Install the feature extractor used by value() and log it."""
        self.features = extractor
        self.world.L.debug("Setting features: %s" % str(self.features))

    def set_weights(self, weights):
        """Set the weight vector after checking it matches the feature count.

        Raises AssertionError if features have not been set yet or if
        len(weights) != self.features.num_features(). The weights are only
        stored once validation has passed.
        """
        if self.features is None:
            raise AssertionError("Features need to be set before weights!")
        expected = self.features.num_features()
        if len(weights) != expected:
            raise AssertionError("Expected %d weights (one per feature), got %d"
                                 % (expected, len(weights)))

        self.weights = weights
        self.world.L.debug("Setting weights: %s" % str(self.weights))

    def value(self, state, loc, action):
        """Compute the value of a given action w.r.t. a given state and ant location.

        The result is the sum of the weights whose feature is truthy, i.e.
        features are treated as on/off indicators rather than multiplied in.
        """
        feature_vector = self.features.extract(self.world, state, loc, action)

        total = 0
        for i, active in enumerate(feature_vector):
            if active:
                total += self.weights[i]
        return total

    def get_direction(self, ant):
        """Evaluates each of the currently passable directions and picks the one with maximum value.

        Returns None when no direction is passable.
        """
        # get the passable directions, in random order to break ties
        rand_dirs = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(rand_dirs)

        # Keep the first direction that strictly beats the best seen so far.
        max_value = float("-inf")
        max_dir = None
        for direction in rand_dirs:
            candidate = self.value(self.state, ant.location, direction)
            if candidate > max_value:
                max_value = candidate
                max_dir = direction

        self.world.L.info("Chose: %s, value: %.2f" % (max_dir, max_value))
        return max_dir

    # Main logic
    def do_turn(self):
        """Precomputes global state, and then chooses max value action for each ant independently."""
        # Destination squares already claimed this turn, keyed by position.
        next_locations = {}

        # Grid lookup resolution: size 10 squares
        if self.state is None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        for ant in self.world.ants:
            if ant.status != AntStatus.ALIVE:
                continue
            ant.direction = self.get_direction(ant)
            if ant.direction == 'halt' or ant.direction is None:
                ant.direction = None
                continue
            # Basic collision detection: don't land on the same square as another friendly ant.
            nextpos = self.world.next_position(ant.location, ant.direction)
            if nextpos in next_locations:
                ant.direction = None
            else:
                next_locations[nextpos] = ant.ant_id

    def reset(self):
        """Drop the cached GlobalState so the next do_turn rebuilds it."""
        self.state = None
예제 #12
0
class QLearnBot(ValueBot):
    """ValueBot trained online with Q-learning, with an exploration bonus.

    Reward coefficients, the learning-rate divider, the discount and the
    exploration threshold are read from module-level constants
    (LIVING_REWARD, FOOD_REWARD, ..., ALPHA_DIVIDER, DISCOUNT,
    EXPLORE_THRESHOLD).
    """

    def __init__(self,world, load_file="save_bots/qbot.json"):
        """Create the bot; ValueBot.__init__ loads load_file when it exists."""
        self.world = world
        ValueBot.__init__(self,world, load_file)
        self.nturns = 0
        self.pathfinder = None

    def get_reward(self, reward_state, ant):
        """Return the hand-tuned scalar reward for ant's previous turn.

        reward_state exposes the events the reward is built from:
            reward_state.food_eaten: fraction of a food item this ant ate
                (between 0 and 1)
            reward_state.was_killed: boolean flag, whether the ant died
            reward_state.death_dealt: fraction of responsibility this ant had
                in killing enemy ants (two ants sharing a kill get 1/2 each)
            reward_state.razed_hill: hill-razing event for this ant
            reward_state.hill_distance: fraction, 1/x
            reward_state.friendy_hill_razed: friendly hill razed (attribute
                name is spelled this way on the reward object)

        On top of these, an exploration bonus is added: 1 for a location whose
        visit count is 0, otherwise 1 / (100 * visits). Each term is scaled by
        its module-level coefficient and a constant LIVING_REWARD is added.
        """
        # Look the visit count up once and reuse it.
        visits = self.state.get_visited(ant.location)
        if visits == 0:
            explore_bonus = 1
        else:
            explore_bonus = float(1)/(visits*100)

        # Per-ant debug trace of every reward component.
        print(":::::Reward Info::::")
        print("food_eaten: "+str(reward_state.food_eaten))
        print("was_killed: "+str(reward_state.was_killed))
        print("death_dealt: "+str(reward_state.death_dealt))
        print("hill_razed: "+str(reward_state.razed_hill))
        print("hill_distance: "+str(reward_state.hill_distance))
        print("friendly hill razed: "+str(reward_state.friendy_hill_razed))
        print("exploration bonus: "+str(explore_bonus))
        print("::::::::::::::::::::")

        reward = 0
        reward += LIVING_REWARD
        reward += FOOD_REWARD*reward_state.food_eaten
        reward += DEATH_REWARD*reward_state.was_killed
        reward += KILL_REWARD*reward_state.death_dealt
        reward += RAZED_REWARD*reward_state.razed_hill
        reward += HILL_DISTANCE_REWARD*reward_state.hill_distance
        reward += FRIENDLY_HILL_RAZED_REWARD*reward_state.friendy_hill_razed
        reward += EXPLORE_BONUS*explore_bonus

        return reward

    def set_pathfinder(self, pathfinder):
        """Store the pathfinder object on the bot."""
        self.pathfinder = pathfinder

    def avoid_collisions(self):
        """Halt any living ant heading for an already-claimed square.

        Ants are processed in self.world.ants order; the first ant to claim a
        destination keeps its direction, later ants targeting the same square
        get direction 'halt'.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status != AntStatus.ALIVE:
                continue
            nextpos = self.world.next_position(ant.location, ant.direction)
            # Test the dict directly (hash lookup) instead of scanning a key list.
            if nextpos in next_locations:
                ant.direction = 'halt'
            else:
                next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """Run one game turn: learn from last turn, pick moves, record features.

        Builds the GlobalState on the first call and refreshes it afterwards,
        runs explore_and_exploit for every ant that is alive or was just
        killed, resolves collisions, and finally stores the features and value
        of the chosen action on each ant so the next turn's update can use
        them.
        """
        if self.state is None:
            # Grid resolution is half the view radius (viewradius2 is squared).
            fog = int(math.sqrt(self.world.viewradius2)/2)
            self.state = GlobalState(self.world, visited_resolution=fog, resolution=fog)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state, ant.location, ant.direction)
            ant.prev_value = self.value(self.state,ant.location,ant.direction)

    def update_weights(self,alpha,discount,reward,maxval,prevval,features):
        """Apply one Q-learning step to self.weights in place.

        Each weight moves by
            alpha * (reward + discount * maxval - prevval) * features[i]

        alpha    -- step size
        discount -- discount factor applied to the best next-state value
        reward   -- reward observed for the previous action
        maxval   -- max over actions of Q(s', a') in the current state
        prevval  -- Q(s, a) recorded when the previous action was chosen
        features -- feature vector recorded for the previous (s, a) pair
        """
        # The temporal-difference error is the same for every weight.
        step = alpha*(reward + discount*maxval - prevval)
        for i in range(len(self.weights)):
            self.weights[i] += step*features[i]

    def explore_and_exploit(self,ant):
        """Update the weights from ant's last move and choose its next direction.

        Returns 'halt' when no direction is passable, a random passable
        direction for a newborn ant or when exploring, and otherwise the
        passable direction with the highest current value.
        """
        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)

        if len(actions)==0:
            return 'halt'

        # A newborn ant has no previous action to learn from: initialise its
        # bookkeeping and send it in a random passable direction.
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state, ant.location, actions[0])
            return actions[0]

        # step 1, update Q(s,a) based on going from last state, taking
        # the action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events, ant)

        # Step size inversely proportional to the number of features, so the
        # gradient step does not overshoot as features are added.
        alpha = float(1) / (len(ant.prev_features)*ALPHA_DIVIDER)
        discount = DISCOUNT

        # max_a' Q(s',a') and its argmax over the passable actions in the
        # current state s'. Start from -inf so that the true maximum is found
        # even when every action currently has a negative value; starting from
        # 0 would report a value of 0 and 'halt' in that case.
        max_next_value = float('-inf')
        max_next_action = 'halt'
        for action in actions:
            val = self.value(self.state,ant.location,action)
            if val > max_next_value:
                max_next_value = val
                max_next_action = action

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha,discount,R,max_next_value,ant.prev_value,ant.prev_features)

        # step 2, explore or exploit? Always explore during the first
        # EXPLORE_THRESHOLD games; afterwards explore with probability
        # 1 / (k + 1), where k = (ngames - EXPLORE_THRESHOLD) // 2, so
        # exploration fades out as more games are played.
        if self.ngames < EXPLORE_THRESHOLD:
            decide_to_explore = True
        else:
            # Floor division keeps the randint bound an integer.
            decay = (self.ngames-EXPLORE_THRESHOLD)//2
            decide_to_explore = random.randint(0, decay) == 0

        if decide_to_explore:
            # actions was shuffled above, so actions[0] is a random choice.
            return actions[0]
        return max_next_action