class QLearnBot(ValueBot):

    def __init__(self, world, load_file="save_bots/qbot.json"):
        ValueBot.__init__(self, world, load_file)
        self.ngame = 0

    def get_reward(self, reward_state):
        """
        Hand-tuned rewards for a state. The RewardState reward_state tracks
        the following events for which you can tailor rewards:

        reward_state.food_eaten: Fraction of a food item this ant ate
            (between 0 and 1)
        reward_state.was_killed: boolean flag whether the ant died this turn
        reward_state.death_dealt: Fraction of responsibility this ant
            contributed to killing other ants (e.g., if 2 ants killed an
            enemy ant, each would have death_dealt=1/2)
        """
        # YOUR CODE HERE
        reward = (reward_state.food_eaten * .7
                  + reward_state.death_dealt * .4
                  + reward_state.was_killed * -.00001)
        return reward

    def avoid_collisions(self):
        """
        Simple logic to avoid collisions. DO NOT TOUCH.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                # Basic collision detection: don't land on the same square as
                # another friendly ant.
                nextpos = self.world.next_position(ant.location, ant.direction)
                if nextpos in next_locations.keys():
                    ant.direction = None
                else:
                    next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """
        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant. You shouldn't need to modify
        this function.
        """
        # Grid lookup resolution: size 10 squares
        if self.state == None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or
        # was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in
        # the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, ant.direction)
            ant.prev_value = self.value(self.state, ant.location, ant.direction)

        self.world.L.info(str(self))

    def update_weights(self, alpha, discount, reward, maxval, prevval, features):
        """
        Perform an update of the weights here according to the Q-learning
        weight update rule described in the homework handout.

        YOUR CODE HERE
        """
        for i in range(len(self.weights)):
            self.weights[i] += alpha * (reward + (discount * maxval) - prevval) * features[i]

    def explore_and_exploit(self, ant):
        '''
        Update weights and decide whether to explore or exploit here.
        Where all the magic happens.

        YOUR CODE HERE
        '''
        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)
        if len(actions) == 0:
            return 'halt'

        # if we have a newborn baby ant, init its rewards and quality fcns
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, actions[0])
            return actions[0]

        # step 1, update Q(s,a) based on going from last state, taking the
        # action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events)

        # step size. it's good to make this inversely proportional to the
        # number of features, so you don't bounce out of the bowl we're trying
        # to descend via gradient descent
        alpha = 0.00001

        # nearly greedy: future rewards count for almost nothing
        discount = 0.00001

        # max_a' Q(s',a'), where right now we are in state s' and the previous
        # state was s. You can use self.value(self.state, ant.location, action)
        # here.
        max_next_action = actions[0]
        max_next_value = self.value(self.state, ant.location, actions[0])
        for action in actions:
            value = self.value(self.state, ant.location, action)
            if value > max_next_value:
                max_next_value = value
                max_next_action = action  # argmax_a' Q(s',a')

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha, discount, R, max_next_value,
                            ant.prev_value, ant.prev_features)

        # step 2, explore or exploit? exploration probability decays with the
        # number of games played so far, self.ngames
        explore = 0.7 / self.ngames
        decision = random.random()
        if explore >= decision:
            return actions[0]
        else:
            return max_next_action
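# ----------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the original submission):
# update_weights() above implements the approximate Q-learning rule
#     w_i <- w_i + alpha * (R + gamma * max_a' Q(s',a') - Q(s,a)) * f_i(s,a)
# The toy function below applies the same update to a hand-made feature vector
# so the direction of the weight change can be checked by hand. The name
# toy_q_update and the sample numbers are hypothetical, for illustration only.
def toy_q_update(weights, features, reward, max_next_value, prev_value,
                 alpha=0.1, discount=0.9):
    """Return a new weight list after one linear Q-learning update."""
    correction = reward + discount * max_next_value - prev_value
    return [w + alpha * correction * f for w, f in zip(weights, features)]

# Example: Q(s,a) was overestimated (prev_value=1.0) relative to the observed
# return (reward=0.5, max_next_value=0.0), so the active feature's weight is
# pushed down while the inactive feature's weight is untouched:
#   toy_q_update([0.5, 0.5], [1, 0], reward=0.5, max_next_value=0.0,
#                prev_value=1.0)  ->  [0.45, 0.5]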
class ValueBot(AntsBot):
    """
    Value function based AntsBot.

    This is a template class that uses a FeatureExtractor and a set of weights
    to make decisions based on a weighted sum of features (value function.) It
    is capable of loading and saving to JSON using the
    FeatureExtractor.to_dict() method and FeatureExtractor(input_dict)
    constructor.
    """

    def __init__(self, world, load_file="valuebot.json"):
        """Initialize, optionally loading from file.

        Note: this bot disables tracking of friendly ants in the AntWorld, so
        that ant_id is no longer consistent between turns. This speeds up game
        speed dramatically, but means it is trickier to maintain specific ant
        states.
        """
        AntsBot.__init__(self, world)
        self.state = None
        self.features = None
        self.weights = None
        self.fog = world.viewradius2

        # **** NOTE: Disable ant tracking to speed up game playing.
        self.world.stateless = False

        # Try to load saved configuration from file
        if load_file is not None and os.path.exists(load_file):
            fp = file(load_file, "r")
            data = json.load(fp)
            self.set_features(FeatureExtractor(data['features']))
            self.set_weights(data['weights'])
            fp.close()

    def save(self, filename):
        """Save features and weights to file."""
        fp = file(filename, "w")
        data = {'features': self.features.to_dict(), 'weights': self.weights}
        json.dump(data, fp)
        fp.close()

    def __str__(self):
        """Print a labeled list of weight values."""
        s = 'ValueBot:\n'
        for i in range(self.features.num_features()):
            s += '\t%s = %g\n' % (self.features.feature_name(i), self.weights[i])
        return s

    def set_features(self, extractor):
        self.features = extractor
        self.world.L.debug("Setting features: %s" % str(self.features))

    def set_weights(self, weights):
        """Set weight vector. Note: checks that len(weights) == self.features.num_features()."""
        self.weights = weights
        self.world.L.debug("Setting weights: %s" % str(self.weights))
        if self.features == None or not len(self.weights) == self.features.num_features():
            raise AssertionError("Features need to be set before weights!")

    def value(self, state, loc, action):
        """Compute the value of a given action w.r.t. a given state and ant location."""
        feature_vector = self.features.extract(self.world, state, loc, action)
        self.world.L.info("Evaluating move: %s, %s:" % (str(loc), action))
        dot_product = 0
        for i in range(0, len(feature_vector)):
            if feature_vector[i]:
                self.world.L.info("\tf: %s = %g" % (self.features.feature_name(i), self.weights[i]))
                dot_product += self.weights[i]
        self.world.L.info("\tdot_product = %g" % dot_product)
        return dot_product

    def get_direction(self, ant):
        """Evaluates each of the currently passable directions and picks the one with maximum value."""
        # get the passable directions, in random order to break ties
        rand_dirs = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(rand_dirs)

        # evaluate the value function for each possible direction and take the
        # direction with maximum value
        value = [0 for i in range(0, len(rand_dirs))]
        max_value = float("-inf")
        max_dir = None
        for i in range(0, len(rand_dirs)):
            value[i] = self.value(self.state, ant.location, rand_dirs[i])
            if value[i] > max_value:
                max_value = value[i]
                max_dir = rand_dirs[i]

        self.world.L.info("Chose: %s, value: %.2f" % (max_dir, max_value))
        return max_dir

    # Main logic
    def do_turn(self):
        """Precomputes global state, and then chooses max value action for each ant independently."""
        # Run the routine for each living ant independently.
        next_locations = {}

        # Grid lookup resolution: FOG-sized squares
        if self.state == None:
            self.state = GlobalState(self.world, resolution=FOG)
        else:
            self.state.update()

        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                ant.direction = self.get_direction(ant)
                if ant.direction == 'halt' or ant.direction == None:
                    ant.direction = None
                else:
                    # Basic collision detection: don't land on the same square
                    # as another friendly ant.
                    nextpos = self.world.next_position(ant.location, ant.direction)
                    if nextpos in next_locations.keys():
                        ant.direction = None
                    else:
                        next_locations[nextpos] = ant.ant_id

    def reset(self):
        self.state = None
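# ----------------------------------------------------------------------------
# Editor's note (illustrative sketch, not from the original file): ValueBot
# stores its policy as a plain JSON object {'features': ..., 'weights': [...]}
# and, as value() above shows, scores a move by summing the weights of the
# active (truthy) features. The snippet below mimics that scoring on dummy
# data; the feature names and numbers are made up for illustration only.
import json

def linear_value(weights, feature_vector):
    """Sum the weights whose corresponding feature is active (truthy)."""
    return sum(w for w, f in zip(weights, feature_vector) if f)

example = {'features': ['food_nearby', 'enemy_nearby', 'on_own_hill'],
           'weights': [0.75, -0.5, -0.25]}
print(linear_value(example['weights'], [1, 0, 1]))  # 0.5 = 0.75 + (-0.25)
print(json.dumps(example))  # same layout that ValueBot.save() writes to disk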
class QLearnBot(ValueBot):

    def __init__(self, world, load_file="save_bots/qbot.json"):
        self.world = world
        ValueBot.__init__(self, world, load_file)
        self.nturns = 0
        self.pathfinder = None

    def get_reward(self, reward_state, ant):
        """
        Hand-tuned rewards for a state. The RewardState reward_state tracks
        the following events for which you can tailor rewards:

        reward_state.food_eaten: Fraction of a food item this ant ate
            (between 0 and 1)
        reward_state.was_killed: boolean flag whether the ant died this turn
        reward_state.death_dealt: Fraction of responsibility this ant
            contributed to killing other ants (e.g., if 2 ants killed an
            enemy ant, each would have death_dealt=1/2)
        reward_state.razed_hill: credit for razing an enemy hill this turn
        reward_state.hill_distance: fraction of the form 1/x
        """
        if self.state.get_visited(ant.location) == 0:
            explore_bonus = 1
        else:
            explore_bonus = float(1) / (self.state.get_visited(ant.location) * 100)

        print ":::::Reward Info::::"
        print "food_eaten: " + str(reward_state.food_eaten)
        print "was_killed: " + str(reward_state.was_killed)
        print "death_dealt: " + str(reward_state.death_dealt)
        print "hill_razed: " + str(reward_state.razed_hill)
        print "hill_distance: " + str(reward_state.hill_distance)
        print "friendly hill razed: " + str(reward_state.friendy_hill_razed)
        print "exploration bonus: " + str(explore_bonus)
        print "::::::::::::::::::::"

        reward = 0
        reward += LIVING_REWARD
        reward += FOOD_REWARD * reward_state.food_eaten
        reward += DEATH_REWARD * reward_state.was_killed
        reward += KILL_REWARD * reward_state.death_dealt
        reward += RAZED_REWARD * reward_state.razed_hill
        reward += HILL_DISTANCE_REWARD * reward_state.hill_distance
        reward += FRIENDLY_HILL_RAZED_REWARD * reward_state.friendy_hill_razed
        reward += EXPLORE_BONUS * explore_bonus
        return reward

    def set_pathfinder(self, pathfinder):
        self.pathfinder = pathfinder

    def avoid_collisions(self):
        """
        Simple logic to avoid collisions. No need to touch this function.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                # Basic collision detection: don't land on the same square as
                # another friendly ant.
                nextpos = self.world.next_position(ant.location, ant.direction)
                if nextpos in next_locations.keys():
                    ant.direction = 'halt'
                else:
                    next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """
        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant. You shouldn't need to modify
        this function.
        """
        # Grid lookup resolution: half the fog-of-war view radius
        if self.state == None:
            fog = int(math.sqrt(self.world.viewradius2) / 2)
            self.state = GlobalState(self.world, visited_resolution=fog, resolution=fog)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or
        # was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in
        # the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, ant.direction)
            ant.prev_value = self.value(self.state, ant.location, ant.direction)

    def update_weights(self, alpha, discount, reward, maxval, prevval, features):
        """
        Perform an update of the weights here according to the Q-learning
        weight update rule described in the homework handout.
        """
        for i in range(len(self.weights)):
            self.weights[i] += alpha * (reward + discount * maxval - prevval) * features[i]

    def explore_and_exploit(self, ant):
        '''
        Update weights and decide whether to explore or exploit here.
        Where all the magic happens.

        YOUR CODE HERE
        '''
        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)
        if len(actions) == 0:
            return 'halt'

        # if we have a newborn baby ant, init its rewards and quality fcns
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, actions[0])
            return actions[0]

        # step 1, update Q(s,a) based on going from last state, taking the
        # action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events, ant)

        # step size. it's good to make this inversely proportional to the
        # number of features, so you don't bounce out of the bowl we're trying
        # to descend via gradient descent
        alpha = float(1) / (len(ant.prev_features) * ALPHA_DIVIDER)

        # discount factor for future rewards
        discount = DISCOUNT

        # max_a' Q(s',a'), where right now we are in state s' and the previous
        # state was s
        max_next_value = 0
        max_next_action = 'halt'
        for action in actions:
            val = self.value(self.state, ant.location, action)
            if max_next_value < val:
                max_next_value = val
                max_next_action = action  # argmax_a' Q(s',a')

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha, discount, R, max_next_value,
                            ant.prev_value, ant.prev_features)

        # step 2, explore or exploit? exploration is forced for the first
        # EXPLORE_THRESHOLD games, then becomes progressively less likely...
        if self.ngames < EXPLORE_THRESHOLD:
            decide_to_explore = True
        else:
            if random.randint(0, (self.ngames - EXPLORE_THRESHOLD) / 2) == 0:
                decide_to_explore = True
            else:
                decide_to_explore = False
        # ...but the override below disables exploration entirely, so the bot
        # always takes the greedy action.
        decide_to_explore = False

        if decide_to_explore:
            return actions[0]
        else:
            return max_next_action
class QLearnBot(ValueBot):

    def __init__(self, world, load_file="save_bots/qbot.json"):
        ValueBot.__init__(self, world, load_file)
        self.nturns = 0
        self.percentSeen = 0
        self.lastPercentSeen = 0
        self.world.stateless = False

    def get_reward(self, reward_state):
        """
        Hand-tuned rewards for a state. The RewardState reward_state tracks
        the following events for which you can tailor rewards:

        reward_state.food_eaten: Fraction of a food item this ant ate
            (between 0 and 1)
        reward_state.was_killed: boolean flag whether the ant died this turn
        reward_state.death_dealt: Fraction of responsibility this ant
            contributed to killing other ants (e.g., if 2 ants killed an
            enemy ant, each would have death_dealt=1/2)
        """
        reward = 0
        if reward_state.death_dealt == 0 \
                and reward_state.was_killed is False \
                and reward_state.food_eaten == 0:
            return -.001
            # reward = .1 * (self.percentSeen - self.lastPercentSeen)
        if reward_state.death_dealt > 0:
            reward += 1. / reward_state.death_dealt
        reward += 10 * reward_state.food_eaten - reward_state.was_killed
        reward += 100 * reward_state.razed_hills
        return reward

    def avoid_collisions(self):
        """
        Simple logic to avoid collisions. No need to touch this function.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                # Basic collision detection: don't land on the same square as
                # another friendly ant.
                nextpos = self.world.next_position(ant.location, ant.direction)
                if nextpos in next_locations.keys():
                    ant.direction = None
                else:
                    next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """
        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant. You shouldn't need to modify
        this function.
        """
        self.nturns += 1
        self.lastPercentSeen = self.percentSeen
        # fraction of the map that has been seen so far
        self.percentSeen = float(len([loc for loc in self.world.map if loc > -5])) \
            / (self.world.width * self.world.height)

        # Grid lookup resolution: size 10 squares
        if self.state == None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or
        # was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)
                if ant.direction is None or ant.direction == 'halt':
                    ant.direction = None

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in
        # the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, ant.direction)
            ant.prev_value = self.value(self.state, ant.location, ant.direction)

        self.world.L.info(str(self))

    def update_weights(self, alpha, discount, reward, maxval, prevval, features):
        """
        Perform an update of the weights here according to the Q-learning
        weight update rule described in the homework handout.

        alpha    = alpha (step size)
        discount = gamma
        reward   = R(s)
        maxval   = max_a' Q_w(s',a')
        prevval  = Q_w(s,a)

        YOUR CODE HERE
        """
        for i in range(len(self.weights)):
            self.weights[i] += alpha * (reward + discount * maxval - prevval) * features[i]

    def explore_and_exploit(self, ant):
        '''
        Update weights and decide whether to explore or exploit here.
        Where all the magic happens.

        YOUR CODE HERE
        '''
        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)
        if len(actions) == 0:
            return 'halt'

        # if we have a newborn baby ant, init its rewards and quality fcns
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, actions[0])

        # step 1, update Q(s,a) based on going from last state, taking the
        # action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events)

        # step size. it's good to make this inversely proportional to the
        # number of features, so you don't bounce out of the bowl we're trying
        # to descend via gradient descent
        alpha = .0001

        # discount factor: future rewards count for half
        discount = 0.5

        # max_a' Q(s',a'), where right now we are in state s' and the previous
        # state was s
        max_next_value = float('-inf')
        max_next_action = 'halt'   # argmax_a' Q(s',a')
        for action in actions:
            newVal = self.value(self.state, ant.location, action)
            if newVal > max_next_value:
                max_next_value = newVal
                max_next_action = action

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha, discount, R, max_next_value,
                            ant.prev_value, ant.prev_features)

        # step 2, explore or exploit? explore with probability 1/(ngames+1),
        # so exploration decays as more games are played
        decide_to_explore = False
        if random.random() < 1. / (self.ngames + 1):
            decide_to_explore = True

        if decide_to_explore:
            return actions[0]
        else:
            return max_next_action
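# ----------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the original submission):
# the exploration schedule above is a decaying epsilon-greedy rule -- explore
# with probability 1/(ngames+1), so the bot explores every move in game 0 and
# roughly 10% of moves by game 9. The helper below reproduces just that
# schedule so it can be checked in isolation.
import random

def decide_to_explore(ngames, rng=random):
    """Return True with probability 1/(ngames+1)."""
    return rng.random() < 1. / (ngames + 1)

# Rough empirical check of the decay:
#   sum(decide_to_explore(0) for _ in range(1000))  -> ~1000 (always explore)
#   sum(decide_to_explore(9) for _ in range(1000))  -> ~100  (about 10%)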
class QLearnBot(ValueBot):

    def __init__(self, world, load_file="save_bots/qbot.json"):
        ValueBot.__init__(self, world, load_file)
        self.ngame = 0

    def get_reward(self, reward_state):
        """
        Hand-tuned rewards for a state. The RewardState reward_state tracks
        the following events for which you can tailor rewards:

        reward_state.food_eaten: Fraction of a food item this ant ate
            (between 0 and 1)
        reward_state.was_killed: boolean flag whether the ant died this turn
        reward_state.death_dealt: Fraction of responsibility this ant
            contributed to killing other ants (e.g., if 2 ants killed an
            enemy ant, each would have death_dealt=1/2)
        """
        food_reward = 2.0 * reward_state.food_eaten
        killer_reward = 0.3 * reward_state.death_dealt
        death_reward = (-0.8) * reward_state.was_killed
        reward = food_reward + killer_reward + death_reward
        return reward

    def avoid_collisions(self):
        """
        Simple logic to avoid collisions. No need to touch this function.
        """
        next_locations = {}
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                # Basic collision detection: don't land on the same square as
                # another friendly ant.
                nextpos = self.world.next_position(ant.location, ant.direction)
                if nextpos in next_locations.keys():
                    ant.direction = None
                else:
                    next_locations[nextpos] = ant.ant_id

    def do_turn(self):
        """
        do_turn just does some bookkeeping and calls the update+explore/exploit
        loop for each living or just killed ant. You shouldn't need to modify
        this function.
        """
        # Grid lookup resolution: size 10 squares
        if self.state == None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        # explore or exploit and update values for every ant that's alive or
        # was just killed
        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE or ant.previous_reward_events.was_killed:
                ant.direction = self.explore_and_exploit(ant)

        self.avoid_collisions()

        # record features for action taken so we can update when we arrive in
        # the next state next turn
        for ant in self.world.ants:
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, ant.direction)
            ant.prev_value = self.value(self.state, ant.location, ant.direction)

        self.world.L.info(str(self))

    def update_weights(self, alpha, discount, reward, maxval, prevval, features):
        """
        Perform an update of the weights here according to the Q-learning
        weight update rule described in the homework handout.
        """
        for i in range(len(self.weights)):
            self.weights[i] += alpha * (reward + discount * maxval - prevval) * features[i]

    def explore_and_exploit(self, ant):
        '''
        Update weights and decide whether to explore or exploit here.
        Where all the magic happens.

        YOUR CODE HERE
        '''
        actions = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(actions)
        if len(actions) == 0:
            return 'halt'

        # if we have a newborn baby ant, init its rewards and quality fcns
        if 'prev_value' not in ant.__dict__:
            ant.prev_value = 0
            ant.previous_reward_events = RewardEvents()
            ant.prev_features = self.features.extract(self.world, self.state,
                                                      ant.location, actions[0])
            return actions[0]

        # step 1, update Q(s,a) based on going from last state, taking the
        # action issued last round, and getting to current state
        R = self.get_reward(ant.previous_reward_events)

        # step size. it's good to make this inversely proportional to the
        # number of features, so you don't bounce out of the bowl we're trying
        # to descend via gradient descent
        alpha = 0.01 / len(self.weights)

        # discount factor: future rewards count fully
        discount = 1.0

        # max_a' Q(s',a') and argmax_a' Q(s',a'), where right now we are in
        # state s' and the previous state was s; uses
        # self.value(self.state, ant.location, action) under the hood
        (max_next_value, max_next_action) = max_by(
            actions, lambda x: self.value(self.state, ant.location, x))

        # now that we have all the quantities needed, adjust the weights
        self.update_weights(alpha, discount, R, max_next_value,
                            ant.prev_value, ant.prev_features)

        # step 2, explore or exploit? always explore for the first
        # explore_start games, never explore after explore_stop games, and
        # linearly anneal the exploration probability in between
        if self.ngames < explore_start:
            decide_to_explore = True
        elif self.ngames < explore_stop:
            p = 1.0 * (explore_stop - self.ngames) / (explore_stop - explore_start)
            decide_to_explore = random.random() < p
        else:
            decide_to_explore = False

        if decide_to_explore:
            return actions[0]
        else:
            return max_next_action
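# ----------------------------------------------------------------------------
# Editor's note: max_by, explore_start and explore_stop are referenced above
# but not defined in this excerpt. A minimal sketch of what they might look
# like is given below; the return convention (value first, item second) is
# inferred from the call site, and the two thresholds are placeholder numbers.
explore_start = 5    # games during which the bot always explores
explore_stop = 50    # games after which the bot never explores

def max_by(items, key):
    """Return (best_value, best_item), where best_value = max(key(item))."""
    best_item = None
    best_value = float('-inf')
    for item in items:
        value = key(item)
        if value > best_value:
            best_value = value
            best_item = item
    return (best_value, best_item)

# Example:
#   max_by(['n', 's', 'e'], lambda d: {'n': 0.1, 's': 0.4, 'e': 0.2}[d])
#   -> (0.4, 's')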
class ValueBot(AntsBot):
    """
    Value function based AntsBot.

    This is a template class that uses a FeatureExtractor and a set of weights
    to make decisions based on a weighted sum of features (value function.) It
    is capable of loading and saving to JSON using the
    FeatureExtractor.to_dict() method and FeatureExtractor(input_dict)
    constructor.
    """

    def __init__(self, world, load_file="valuebot.json"):
        """Initialize, optionally loading from file.

        Note: this bot disables tracking of friendly ants in the AntWorld, so
        that ant_id is no longer consistent between turns. This speeds up game
        speed dramatically, but means it is trickier to maintain specific ant
        states.
        """
        AntsBot.__init__(self, world)
        self.state = None
        self.features = None
        self.weights = None

        # **** NOTE: Disable ant tracking to speed up game playing.
        self.world.stateless = False

        # Try to load saved configuration from file
        if load_file is not None and os.path.exists(load_file):
            fp = file(load_file, "r")
            data = json.load(fp)
            self.set_features(FeatureExtractor(data['features']))
            self.set_weights(data['weights'])
            fp.close()

    def save(self, filename):
        """Save features and weights to file."""
        fp = file(filename, "w")
        data = {'features': self.features.to_dict(), 'weights': self.weights}
        json.dump(data, fp)
        fp.close()

    def save_readable(self, filename):
        """Save a human-readable listing of features and weights to file."""
        fp = file(filename, "w")
        fp.write(str(self))
        fp.close()

    def __str__(self):
        """Print a labeled list of weight values."""
        s = 'ValueBot:\n'
        for i in range(self.features.num_features()):
            s += '\t%s = %g\n' % (self.features.feature_name(i), self.weights[i])
        return s

    def set_features(self, extractor):
        self.features = extractor
        self.world.L.debug("Setting features: %s" % str(self.features))

    def set_weights(self, weights):
        """Set weight vector. Note: checks that len(weights) == self.features.num_features()."""
        self.weights = weights
        self.world.L.debug("Setting weights: %s" % str(self.weights))
        if self.features == None or not len(self.weights) == self.features.num_features():
            raise AssertionError("Features need to be set before weights!")

    def value(self, state, loc, action):
        """Compute the value of a given action w.r.t. a given state and ant location."""
        feature_vector = self.features.extract(self.world, state, loc, action)
        # self.world.L.info("Evaluating move: %s, %s:" % (str(loc), action))
        dot_product = 0
        for i in range(0, len(feature_vector)):
            if feature_vector[i]:
                # self.world.L.info("\tf: %s = %g" % (self.features.feature_name(i), self.weights[i]))
                dot_product += self.weights[i]
        # self.world.L.info("\tdot_product = %g" % dot_product)
        return dot_product

    def get_direction(self, ant):
        """Evaluates each of the currently passable directions and picks the one with maximum value."""
        # get the passable directions, in random order to break ties
        rand_dirs = self.world.get_passable_directions(ant.location, AIM.keys())
        random.shuffle(rand_dirs)

        # evaluate the value function for each possible direction and take the
        # direction with maximum value
        value = [0 for i in range(0, len(rand_dirs))]
        max_value = float("-inf")
        max_dir = None
        for i in range(0, len(rand_dirs)):
            value[i] = self.value(self.state, ant.location, rand_dirs[i])
            if value[i] > max_value:
                max_value = value[i]
                max_dir = rand_dirs[i]

        self.world.L.info("Chose: %s, value: %.2f" % (max_dir, max_value))
        return max_dir

    # Main logic
    def do_turn(self):
        """Precomputes global state, and then chooses max value action for each ant independently."""
        # Run the routine for each living ant independently.
        next_locations = {}

        # Grid lookup resolution: size 10 squares
        if self.state == None:
            self.state = GlobalState(self.world, resolution=10)
        else:
            self.state.update()

        for ant in self.world.ants:
            if ant.status == AntStatus.ALIVE:
                ant.direction = self.get_direction(ant)
                if ant.direction == 'halt' or ant.direction == None:
                    ant.direction = None
                else:
                    # Basic collision detection: don't land on the same square
                    # as another friendly ant.
                    nextpos = self.world.next_position(ant.location, ant.direction)
                    if nextpos in next_locations.keys():
                        ant.direction = None
                    else:
                        next_locations[nextpos] = ant.ant_id

    def reset(self):
        self.state = None
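# ----------------------------------------------------------------------------
# Editor's note (illustrative usage sketch, not from the original file): a
# typical round trip is to set or train weights on one ValueBot, save them,
# and let a later run pick them up through the load_file constructor argument.
# `world` below stands in for however the game engine constructs an AntWorld;
# that setup is assumed, not shown, so the sketch is left as comments.
#
#   bot = ValueBot(world, load_file=None)
#   bot.set_features(FeatureExtractor({...}))       # feature config dict
#   bot.set_weights([0.0] * bot.features.num_features())
#   bot.save("valuebot.json")
#   bot.save_readable("valuebot.txt")               # human-readable dump
#
#   # next run: weights are restored automatically if the file exists
#   bot2 = ValueBot(world, load_file="valuebot.json")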