class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env, learning_rate=0.6, discount_factor=0.35): super(LearningAgent, self).__init__(env) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.alpha = learning_rate self.gamma = discount_factor self.state = None self.Q_values = self.initialize_Q_values() def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required def initialize_Q_values(self, val=10.0): Q_values = {} for waypoint in ['left', 'right', 'forward']: for light in ['red', 'green']: for action in self.env.valid_actions: Q_values[((waypoint, light), action)] = val return Q_values def find_max_Q(self, state): action = None max_Q = 0.0 for a in self.env.valid_actions: Q = self.Q_values[(state, a)] if Q > max_Q: action = a max_Q = Q return (max_Q, action) def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint() # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # TODO: Update state self.state = (self.next_waypoint, inputs['light']) # TODO: Select action according to your policy #action = random.choice(self.env.valid_actions) (Q, action) = self.find_max_Q(self.state) # Execute action and get reward reward = self.env.act(self, action) # TODO: Learn policy based on state, action, reward next_waypoint = self.planner.next_waypoint() inputs = self.env.sense(self) state_prime = (next_waypoint, inputs['light']) (Q_prime, action_prime) = self.find_max_Q(state_prime) Q += self.alpha * (reward + self.gamma*Q_prime - Q) self.Q_values[(self.state, action)] = Q print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward) # [debug]
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.learner = QLearn(epsilon=1, alpha=0.3, gamma=0.6)

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Select action according to the policy
        action = self.learner.action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        next_inputs = self.env.sense(self)
        self.next_waypoint_after_action = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.next_waypoint_after_action)
        self.learner.learn(self.state, action, next_state, reward)

        print "QLearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
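# The QLearn class used above is defined elsewhere in the file and not shown here.
# The following is a minimal sketch of what it plausibly looks like: only the method
# names `action` and `learn` and the constructor parameters are taken from the call
# sites above; the epsilon-greedy policy and dict-backed table are assumptions.
import random

class QLearn(object):
    """Dict-backed Q-learner (sketch)."""

    def __init__(self, epsilon=0.1, alpha=0.3, gamma=0.6,
                 actions=(None, 'forward', 'left', 'right')):
        self.q = {}             # maps (state, action) -> Q-value
        self.epsilon = epsilon  # exploration rate
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.actions = actions

    def action(self, state):
        """Epsilon-greedy action selection with random tie-breaking."""
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        qs = [self.q.get((state, a), 0.0) for a in self.actions]
        best = max(qs)
        return random.choice([a for a, q in zip(self.actions, qs) if q == best])

    def learn(self, state, action, next_state, reward):
        """Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
        old_q = self.q.get((state, action), 0.0)
        max_next = max(self.q.get((next_state, a), 0.0) for a in self.actions)
        self.q[(state, action)] = old_q + self.alpha * (reward + self.gamma * max_next - old_q)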
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        # Actions
        self.A = ['forward', 'left', 'right', None]  # all the actions available
        self.trial = 0
        # Initialize Q table keyed by (light, oncoming, next_waypoint)
        self.Q = {}
        for i in ['green', 'red']:                       # possible lights
            for j in [None, 'forward', 'left', 'right']: # possible oncoming agents
                for k in ['forward', 'left', 'right']:   # possible next_waypoints
                    self.Q[(i, j, k)] = [1] * len(self.A)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)

        # Find the index of the max Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        # Assign action: act randomly with probability epsilon
        # (matches the original 2-in-5 chance, written as an explicit probability)
        epsilon = 0.4
        if random.random() < epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        gamma = 0.30
        alp_tune = 500  # tuning parameter
        alpha = 1.0 / (1.1 + float(self.trial) / alp_tune)  # decay learning rate (float division so it decays every trial)
        self.trial += 1

        # Get the next state for Q(s', a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_next_waypoint)

        # Update Q table
        self.Q[self.state][self.A.index(action)] = \
            (1 - alpha) * self.Q[self.state][self.A.index(action)] + \
            alpha * (reward + gamma * max(self.Q[next_state]))
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        valid_waypoints = ['forward', 'right', 'left']
        valid_lights = ['red', 'green']
        valid_actions = ['forward', 'right', 'left', None]
        # Q_hat maps each state to a dict of action -> value, randomly initialized
        self.Q_hat = {}
        states = [(next_waypoint, light, oncoming, right, left)
                  for next_waypoint in valid_waypoints
                  for light in valid_lights
                  for oncoming in valid_actions
                  for right in valid_actions
                  for left in valid_actions]
        states.append('Destination')
        for state in states:
            self.Q_hat[state] = {}
            for action in valid_actions:
                self.Q_hat[state][action] = random.random()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if self.next_waypoint is None:
            self.state = 'Destination'
        else:
            self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])

        # Select action according to the policy (epsilon-greedy)
        valid_actions = ['forward', 'right', 'left', None]
        epsilon = 0.1  # 0.01 best
        best_action = max(self.Q_hat[self.state], key=self.Q_hat[self.state].get)
        random_action = choice(valid_actions)
        # note: 'choice' must support the p= keyword, i.e. numpy.random.choice
        action = choice([best_action, random_action], p=[1 - epsilon, epsilon])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        new_next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs = self.env.sense(self)
        if new_next_waypoint is None:
            # map the terminal case to the same 'Destination' state used above
            # (otherwise the Q_hat lookup below would raise a KeyError on arrival)
            new_state = 'Destination'
        else:
            new_state = (new_next_waypoint, new_inputs['light'], new_inputs['oncoming'], new_inputs['right'], new_inputs['left'])
        alpha = 0.5  # opt 0.7
        gamma = 0.5  # opt 0.1
        max_Qhat_ahat = max(self.Q_hat[new_state].values())
        self.Q_hat[self.state][action] = (1 - alpha) * self.Q_hat[self.state][action] + alpha * (reward + gamma * max_Qhat_ahat)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha, q_initial):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.policy = {}
        self.trip_log = []
        self.trip = None
        self.alpha = alpha
        self.q_initial = q_initial

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        if self.trip:
            self.trip_log.append(self.trip)
        self.trip = {}
        self.trip['Deadline'] = self.env.get_deadline(self)
        self.trip['Reward'] = 0
        self.trip['Penalty'] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state: drop the 'right' input and add the waypoint
        inputs.pop('right')
        self.state = inputs
        self.state['waypoint'] = self.next_waypoint

        # Select action according to the policy (greedy over optimistically initialized Q-values)
        state_hash = hashabledict(frozenset(self.state.iteritems()))
        q = self.policy.get(state_hash, {
            None: self.q_initial,
            'forward': self.q_initial,
            'left': self.q_initial,
            'right': self.q_initial,
        })
        action = max(q.iteritems(), key=operator.itemgetter(1))[0]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Update trip stats
        self.trip['Reward'] += reward
        self.trip['Remaining'] = self.env.get_deadline(self)
        self.trip['Success'] = self.planner.next_waypoint() is None
        if reward < 0:
            self.trip['Penalty'] += reward

        # Learn policy based on state, action, reward (one-step update, no future-reward term)
        q[action] = (1 - self.alpha) * q[action] + self.alpha * reward
        self.policy[state_hash] = q
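# 'operator' and 'hashabledict' above come from elsewhere in the file. A plausible
# header (an assumption, not the author's code): operator is the standard library
# module, and hashabledict is the common recipe for a dict usable as a dict key.
import operator

class hashabledict(dict):
    """A dict that can serve as a Q-table key (sketch; contents must not change after hashing)."""
    def __hash__(self):
        return hash(frozenset(self.iteritems()))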
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.QLearning = QLearning(epsilon=0.3, alpha=0.95, gamma=0.9)
        # self.QLearning = pickle.load(open('./QLearning.pkl'))  # restore a previously trained table

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def is_action_okay(self, inputs, waypoint):
        """Check whether following the given waypoint would be a legal move.

        The waypoint is passed explicitly so the post-action check below uses the
        new waypoint rather than the stale self.next_waypoint.
        """
        action_okay = True
        if waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif waypoint == 'left':
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False
        return action_okay

    def update(self, t):
        # Gather states
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state: (is the waypoint legal?, waypoint, is the deadline close?)
        action_okay = self.is_action_okay(inputs, self.next_waypoint)
        self.state = (action_okay, self.next_waypoint, deadline < 3)
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Select action according to the policy
        action = self.QLearning.policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Gather next states
        next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        next_inputs = self.env.sense(self)
        next_deadline = self.env.get_deadline(self)
        next_action_okay = self.is_action_okay(next_inputs, next_waypoint)
        next_state = (next_action_okay, next_waypoint, next_deadline < 3)
        # next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_waypoint)

        # Learn policy based on state, action, reward
        self.QLearning.update_QTable(self.state, action, reward, next_state)
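# The commented-out pickle.load line above implies the trained QLearning object is
# persisted between runs. A minimal sketch of that round-trip, assuming the standard
# pickle module and a LearningAgent instance named `agent` (both the variable name
# and the save step are illustrative, not from the original file):
import pickle

# After a training run, save the learned table...
with open('./QLearning.pkl', 'wb') as f:
    pickle.dump(agent.QLearning, f)

# ...and on a later run, restore it instead of starting from scratch.
with open('./QLearning.pkl', 'rb') as f:
    agent.QLearning = pickle.load(f)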
class BasicLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    # State is a named tuple of the traffic light and the planner's waypoint
    # (namedtuple comes from collections; hoisted to class level so it is built once).
    State = namedtuple("State", ["light", "next_waypoint"])

    def __init__(self, env):
        super(BasicLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0
        self.deadline = self.env.get_deadline(self)
        self.actions = ['forward', 'left', 'right', None]
        self.reached_destination = 0
        self.penalties = 0
        self.movements = 0

    def tuple_state(self, state):
        """Reduce the sensed inputs to a (light, next_waypoint) named tuple."""
        return self.State(light=state['light'], next_waypoint=self.planner.next_waypoint())

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.tuple_state(inputs)

        # Select action according to the (random baseline) policy
        action = random.choice(self.actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward >= 10:
            self.reached_destination += 1
        if reward < 0:
            self.penalties += 1
        self.movements += 1

        # Learn policy based on state, action, reward
        self.last_action = action
        self.last_state = self.state
        self.last_reward = reward
        self.total_reward += reward
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.actions = ['forward', 'left', 'right', None]
        self.q_table = Counter()  # Counter returns 0 for unseen (state, action) pairs
        self.alpha = 0.5
        self.gamma = 0.2

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def get_best_action(self, state):
        """Return (greedy action, its Q-value) for the given state."""
        best_profit = float('-inf')
        best_action = None
        for a in self.actions:
            profit = self.q_table[(state, a)]
            if profit > best_profit:
                best_profit = profit
                best_action = a
        return best_action, best_profit

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Epsilon-greedy selection: explore with probability 3/11 (randint is inclusive on both ends)
        if randint(0, 10) < 3:
            action = self.actions[randint(0, 3)]
        else:
            action, _ = self.get_best_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_next_waypoint)
        _, best_profit = self.get_best_action(next_state)
        self.q_table[(self.state, action)] = (1 - self.alpha) * self.q_table[(self.state, action)] + \
            (self.alpha * (reward + self.gamma * best_profit))

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
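# This snippet relies on module-level imports that are not shown. Based on the
# names used above, the header is presumably:
from collections import Counter
from random import randint
# Using a Counter as the Q-table is a small but effective trick: lookups of unseen
# (state, action) pairs return 0 instead of raising KeyError, so no explicit
# initialization pass is needed.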
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.debugOn = False
        # driver_type is a module-level setting; Reporter, State, and the *Driver
        # classes are defined elsewhere in the file.
        self.reporter = Reporter(driver_type, self.env.valid_actions)
        if driver_type == 'random':
            self.driver = RandomDriver(self.env.valid_actions)
        elif driver_type == 'greedy':
            self.driver = GreedyDriver(self.env.valid_actions)
        elif driver_type == 'greedy2':
            self.driver = Greedy2Driver(self.env.valid_actions)
        elif driver_type == 'smart':
            self.driver = SmartDriver(self.env.valid_actions, self.reporter)
        elif driver_type == 'smart2':
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)
        elif driver_type == 'smart3':
            self.driver = Smart3Driver(self.env.valid_actions, self.reporter)
        else:
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.reporter.reset(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = State.make(self.next_waypoint, inputs)

        # Select action according to the policy
        action = self.driver.decide(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        newWaypoint = self.planner.next_waypoint()
        newState = State.make(newWaypoint, self.env.sense(self))
        self.driver.learn(t, deadline, self.state, action, reward, newState)

        location = self.env.agent_states[self]['location']
        if self.debugOn:
            print "LearningAgent.update(): deadline={} inputs={} action={} reward={} location={}".format(deadline, inputs, action, reward, location)  # [debug]
        self.reporter.update(t, deadline, self.state, action, reward, location, self.driver)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.Q = {}      # Q-values keyed by (state, action)
        self.pi = {}     # greedy policy keyed by state
        self.alpha = 1   # learning rate
        self.gamma = 0   # discount factor (0: ignore future rewards)
        self.ACTIONS = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = inputs
        self.state['next_waypoint'] = self.next_waypoint
        self.state = tuple(self.state.items())

        # Select action according to the policy (random until the state has a learned policy entry)
        action = random.choice(self.ACTIONS) if self.state not in self.pi else self.pi[self.state]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        next_state = self.env.sense(self)
        next_state['next_waypoint'] = self.planner.next_waypoint()
        next_state = tuple(next_state.items())
        next_action = random.choice(self.ACTIONS) if next_state not in self.pi else self.pi[next_state]
        sta_act = (self.state, action)
        self.Q[sta_act] = (1 - self.alpha) * self.Q.get(sta_act, 0) + \
            self.alpha * (reward + self.gamma * self.Q.get((next_state, next_action), 0))

        # Once every action in this state has a Q-value, record the greedy action as the policy
        acts_Q = [self.Q.get((self.state, act)) for act in self.ACTIONS]
        if None not in acts_Q:
            max_Q = max(acts_Q)
            max_acts = [act for act in self.ACTIONS if max_Q == self.Q.get((self.state, act))]
            self.pi[self.state] = random.choice(max_acts)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.possible_actions = Environment.valid_actions
        self.ai = QLearn(epsilon=.05, alpha=0.1, gamma=0.9)

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Build the state from named inputs (indexing inputs.items() by position,
        # as the original did, is fragile because dict ordering is arbitrary)
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        action = self.ai.Q_move(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Re-query the planner so next_state reflects the post-action waypoint
        inputs2 = self.env.sense(self)
        next_state = (inputs2['light'], inputs2['oncoming'], inputs2['left'], self.planner.next_waypoint())
        self.ai.Q_post(self.state, action, next_state, reward)

        print "QLearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        # from route planner, also displayed by simulator
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.next_waypoint, inputs['light']
        # self.state = self.next_waypoint, (inputs['oncoming'] == None or inputs['light'] == 'green')

        # Select action according to your policy
        # Define actions
        actions = [None, 'forward', 'left', 'right']
        # QUESTION 1 - select random action
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.alpha = 0.2
        self.Qtable = {}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state to inform the driving agent
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'], self.next_waypoint)

        # Select action according to the policy
        valid_actions = [None, 'forward', 'left', 'right']
        # Random pick (baseline, disabled):
        # action = random.choice(valid_actions)
        # Get Q-values for every action in this state (default 0 for unseen pairs)
        all_actions = {action: self.Qtable.get((self.state, action), 0) for action in valid_actions}
        # Find the actions that maximize the Q-value
        best_actions = [action for action in valid_actions if all_actions[action] == max(all_actions.values())]
        # Break ties randomly among the best actions
        action = random.choice(best_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward.
        # To simplify the problem, gamma is set to 0 (no future-reward term).
        self.Qtable[(self.state, action)] = \
            (1 - self.alpha) * self.Qtable.get((self.state, action), 0) + self.alpha * reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_matrix = {}
        self.alpha = 0.95    # learning rate
        self.gamma = 0.05    # discount factor
        self.explore = 0.05  # exploration rate
        self.state = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        deadline = self.env.get_deadline(self)
        state = self.observe_state()
        self.state = state
        knowledge = self.knowledge_from_state(state)

        # Select action according to the policy (epsilon-greedy)
        if random() < self.explore:
            action = choice(OPTIONS)
        else:
            action = sorted(knowledge.items(), key=lambda x: x[1], reverse=True)[0][0]
        old_value = knowledge[action]

        # Execute action and get reward
        reward = self.env.act(self, action)

        new_state = self.observe_state()
        new_state_knowledge = self.knowledge_from_state(new_state)
        self.q_matrix[state][action] = (1 - self.alpha) * old_value + \
            self.alpha * (reward + self.gamma * max(new_state_knowledge.values()))

        # print "LearningAgent.update(): deadline = {}, state = {}, action = {}, reward = {}".format(deadline, state, action, reward)  # [debug]

    def knowledge_from_state(self, state):
        """Get what we know about the given state, initializing unseen states to all-zero."""
        if state not in self.q_matrix:
            self.q_matrix[state] = {option: 0 for option in OPTIONS}
        return self.q_matrix[state]

    def observe_state(self):
        """Observe the current state."""
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        return (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
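# The names random, choice, and OPTIONS above are imported or defined at module
# level and not shown in this snippet. A plausible header, inferred from how the
# names are used (the OPTIONS values follow the smartcab's standard action set):
from random import random, choice

# The four actions a smartcab can take at an intersection.
OPTIONS = (None, 'forward', 'left', 'right')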
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state: the sensed inputs (light and traffic) plus the planner's
        # waypoint, sorted so equivalent observations always map to the same tuple
        inputs['waypoint'] = self.next_waypoint
        self.state = tuple(sorted(inputs.items()))

        # TODO: Select action according to your policy
        actions = [None, 'forward', 'left', 'right']
        action = actions[random.randint(0, 3)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class BasicAgent(Agent):
    """A baseline agent that acts randomly."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.color = 'red'  # override color

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        action = random.choice(self.env.valid_actions)
        reward = self.env.act(self, action)

        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        right = inputs['right']

        # Update the current observed state
        self.state = (light, oncoming, left, right, self.next_waypoint)

        print "Basic.update(): next move = {}".format(action)  # [debug]
class LearningAgent(Agent):
    """An agent that follows waypoints, vetoing moves that are not safe."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.next_waypoint = None
        self.total_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Select action according to the policy
        # (these helper methods are defined elsewhere in the file)
        loc, hed = self.get_my_location()
        action = self.get_next_waypoint_given_location(loc, hed)
        action_okay = self.check_if_action_is_ok(inputs)
        if not action_okay:
            action = None

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = inputs['location']
        self.heading = inputs['heading']

        action = random.choice(Environment.valid_actions)
        reward = self.env.act(self, action)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.successes = 0
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Q_Learner, StateMapper, and QValueRandomMixActionChooser are defined elsewhere in the file
        self.learner = Q_Learner()
        self.mapper = StateMapper(self.learner)
        self.actionChooser = QValueRandomMixActionChooser(self.learner)
        self.negative_rewards = 0

    def reset(self, destination=None):
        self.learner.prior_state = None
        self.actionChooser.first = True
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        learner = self.learner
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if learner.prior_state is None:
            learner.prior_state = self.mapper.mapDataToState(self.next_waypoint, inputs, deadline)
            self.state = learner.prior_state[1]
        else:
            learner.update()
            learner.prior_state = learner.current_state

        # Select action according to the policy
        action = self.actionChooser.chooseAction()
        learner.action = action

        # Execute action and get reward
        learner.reward = self.env.act(self, action)
        if learner.reward < 0:
            self.negative_rewards += learner.reward

        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        location = self.env.agent_states[self]["location"]
        destination = self.env.agent_states[self]["destination"]
        inputs = self.env.sense(self)
        learner.current_state = self.mapper.mapDataToState(self.next_waypoint, inputs, deadline)
        self.state = learner.current_state[1]

        # Keep track of how often we were successful for later evaluation
        if location == destination:
            self.successes = self.successes + 1
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.prevReward = 0
        self.prevAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.prevReward = 0
        self.prevAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Determine which actions are legal under the traffic rules
        validActions = []
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                validActions = ['right', None]
        else:
            if inputs['oncoming'] == 'forward':
                validActions = ['forward', 'right']
            else:
                validActions = ['right', 'forward', 'left']

        # Select action according to the policy
        if validActions != []:
            action = random.choice(validActions)
        else:
            action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        # (the original branched on reward > prevReward, but both branches were identical)
        if action is not None:
            self.state = (inputs, self.next_waypoint, deadline)
            self.prevAction = action
            self.prevReward = reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qtable = defaultdict(dict)  # defaultdict comes from collections (module-level import)
        self.epsilon = .4  # Randomness factor
        self.alpha = .4    # Learning rate

    def reset(self, destination=None):
        # Prepare for a new trip; reset any variables here, if required
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.define_state(inputs, self.next_waypoint)

        # Select action according to policy
        action = self.select_action()

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_qtable(action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

    def select_action(self):
        if random.random() < self.epsilon or self.state not in self.qtable:
            action = random.choice(self.env.valid_actions)
        else:
            action = max(self.qtable[self.state], key=self.qtable[self.state].get)

        if self.state not in self.qtable:
            for a in self.env.valid_actions:  # init action:reward dict
                self.qtable[self.state][a] = 0

        return action

    def update_qtable(self, action, reward):
        old_q = self.qtable[self.state].get(action, 0)
        self.qtable[self.state][action] = ((1 - self.alpha) * old_q) + (self.alpha * reward)
        # Decay alpha and epsilon each step, flooring them at 0.001 and 0.0, respectively
        self.alpha = max(0.001, self.alpha - 0.002)
        self.epsilon = max(0.0, self.epsilon - 0.002)

    @staticmethod
    def define_state(inputs, next_waypoint):
        return tuple(inputs.values()) + (next_waypoint, )
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    ACTIONS = [None, 'left', 'forward', 'right']

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.Qtable = {}
        self.alpha = 0.8
        self.gamma = 0.6

    def init_state_entries(self, state):
        """Make sure every (state, action) pair for this state exists in the Q-table."""
        for a in self.ACTIONS:
            if (state, a) not in self.Qtable:
                self.Qtable[(state, a)] = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        old_state = self.state

        # Select action according to the policy: greedy with random tie-breaking
        self.init_state_entries(old_state)
        max_q = max(self.Qtable[(old_state, a)] for a in self.ACTIONS)
        action = random.choice([a for a in self.ACTIONS if self.Qtable[(old_state, a)] == max_q])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        # self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        new_state = self.state
        self.init_state_entries(new_state)
        self.Qtable[(old_state, action)] = (1 - self.alpha) * self.Qtable[(old_state, action)] + \
            self.alpha * (reward + self.gamma * max(self.Qtable[(new_state, a)] for a in self.ACTIONS))
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = [None, 'forward', 'left', 'right']
        self.learning_rate = 0.3
        self.state = None
        self.q = {}
        self.trips = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.trips += 1
        if self.trips >= 100:
            self.learning_rate = 0  # stop learning after 100 trips

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'], self.next_waypoint)
        self.state = state

        # Lazy initialization of Q-values
        for action in self.actions:
            if (state, action) not in self.q:
                self.q[(state, action)] = 1

        # Select action according to the policy
        probabilities = [self.q[(state, None)], self.q[(state, 'forward')],
                         self.q[(state, 'left')], self.q[(state, 'right')]]
        # Use the softmax function so that the values actually behave like probabilities
        probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=0)
        # Float division matters here: integer division would pin this to 0 after the first trip
        adventurousness = max((100 - self.trips) / 100.0, 0)
        if random.random() < adventurousness:
            action = np.random.choice(self.actions, p=probabilities)
        else:
            action = self.actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q[(state, action)] = self.learning_rate * reward + (1 - self.learning_rate) * self.q[(state, action)]

        # print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def get_state(self):
        return self.state
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha=0.1, gamma=0.9)
        self.next_state = None
        self.time_step = 1.0

    def get_decay_rate(self, t):
        """Decay rate for epsilon."""
        return 1.0 / float(t)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.time_step = 1.0
        # Prepare for a new trip; reset any variables here, if required
        '''
        self.previous_action = None
        self.next_state = None
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha=0.1, gamma=0.9)
        self.next_state = None
        '''

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        # env_states = self.env.agent_states[self]

        # Decay epsilon as the trial progresses
        self.time_step += 1
        epsilon = self.get_decay_rate(self.time_step)
        self.QLearner.set_eps(epsilon)

        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        # self.state = (inputs['light'], self.next_waypoint)

        action = self.QLearner.q_move(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        new_inputs = self.env.sense(self)
        # env_states = self.env.agent_states[self]
        new_deadline = self.env.get_deadline(self)
        self.next_state = (new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], self.next_waypoint)
        # self.next_state = (new_inputs['light'], self.next_waypoint)

        # Learn policy based on state, action, reward
        self.QLearner.q_future_reward(self.state, action, self.next_state, reward)

        print "QLearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
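# QAlgorithm is defined elsewhere in the file. A minimal sketch consistent with the
# call sites above (the constructor signature, set_eps, q_move, and q_future_reward);
# the table layout and tie-breaking are assumptions:
import random

class QAlgorithm(object):
    """Epsilon-greedy Q-learner with an adjustable exploration rate (sketch)."""

    ACTIONS = (None, 'forward', 'left', 'right')

    def __init__(self, epsilon=0.03, alpha=0.1, gamma=0.9):
        self.epsilon, self.alpha, self.gamma = epsilon, alpha, gamma
        self.q = {}  # (state, action) -> value

    def set_eps(self, epsilon):
        """Let the agent decay exploration over time."""
        self.epsilon = epsilon

    def q_move(self, state):
        """Pick an action: random with probability epsilon, otherwise greedy."""
        if random.random() < self.epsilon:
            return random.choice(self.ACTIONS)
        qs = [self.q.get((state, a), 0.0) for a in self.ACTIONS]
        best = max(qs)
        return random.choice([a for a, v in zip(self.ACTIONS, qs) if v == best])

    def q_future_reward(self, state, action, next_state, reward):
        """One-step Q-learning backup."""
        old = self.q.get((state, action), 0.0)
        max_next = max(self.q.get((next_state, a), 0.0) for a in self.ACTIONS)
        self.q[(state, action)] = old + self.alpha * (reward + self.gamma * max_next - old)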
class BasicAgent(Agent):
    """A basic agent upon which to build learning agents."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qvals = {}          # mapping (state, action) to q-values
        self.time = 0            # number of moves performed
        self.possible_actions = (None, 'forward', 'left', 'right')
        self.n_dest_reached = 0  # number of destinations reached
        self.last_dest_fail = 0  # last time agent failed to reach destination
        self.sum_time_left = 0   # sum of time left upon reaching destination over all trials
        self.n_penalties = 0     # number of penalties incurred
        self.last_penalty = 0    # last trial in which the agent incurred a penalty

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def best_action(self, state):
        """Return a random action (other agents will have different policies)."""
        return random.choice(self.possible_actions)

    def update_qvals(self, state, action, reward):
        """Keep track of visited (state, action) pairs.

        (Other agents will use the reward to update the mapping from
        (state, action) pairs to q-values.)
        """
        self.qvals[(state, action)] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update time
        self.time += 1

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Pick an action
        action = self.best_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.n_penalties += 1

        # Update the q-value of the (state, action) pair
        self.update_qvals(self.state, action, reward)

        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here.
        # The Q-table is a pandas DataFrame: one row per light color, one column per action.
        self.acts = ['left', 'right', 'forward', 'stay']
        self.Q = pd.DataFrame(0 * np.ones((2, 4)), index=['red', 'green'], columns=self.acts)
        self.deadline = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        self.deadline = deadline

        # Update state: here the state is simply the color of the traffic light at the intersection
        self.state = inputs['light']

        # Select action according to the policy (greedy Q-learner)
        act_opt = self.Q.ix[self.state].argmax()
        if act_opt == 'stay':
            act_opt = None
        # print "Current state = %s, greedy action = %s" % (self.state, act_opt)
        action = act_opt

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward.
        # Maximize Q conditional on the state S we are in.
        q_S = self.Q.ix[self.state]    # Q conditional on state S
        q_S_mx = q_S[q_S == q_S.max()] # in case the maximizing action is not unique...
        ix_mx = np.random.randint(0, len(q_S_mx))  # ...pick one of the equally good actions at random
        q_S_mx = q_S_mx.iloc[ix_mx]

        # Update Q
        act_tmp = action if action is not None else 'stay'
        global alpha, alpha0, gamma
        alpha *= alpha0  # decay the learning rate multiplicatively each step
        self.Q.ix[self.state, act_tmp] = \
            (1 - alpha) * self.Q.ix[self.state, act_tmp] + \
            alpha * (reward + gamma * q_S_mx)
        # Note: we update the action actually taken, but the future-value term uses the
        # max of Q over the *current* state S, since this agent never senses the successor state.
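# The update above mutates module-level alpha, alpha0, and gamma, and builds its
# Q-table with pandas and numpy. Those globals and imports are not shown in the
# snippet; a plausible module header (the values are assumptions, chosen so that
# `alpha *= alpha0` decays the learning rate geometrically):
import numpy as np
import pandas as pd

# Module-level hyperparameters mutated inside LearningAgent.update()
alpha = 0.9    # current learning rate (decays every step)
alpha0 = 0.99  # per-step multiplicative decay factor
gamma = 0.3    # discount factor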
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.Q = {}
        self.learning_rate = 0.9
        self.states = None
        self.possible_actions = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state (alternatives tried are kept for reference)
        # state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'], deadline)
        # state = (self.next_waypoint, inputs['light'])
        state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])
        self.state = state

        # Optimistic initialization: unseen (state, action) pairs start at 20
        for action in self.possible_actions:
            if (state, action) not in self.Q:
                self.Q[(state, action)] = 20

        # Select action according to the policy
        Q_best_value = [self.Q[(state, None)], self.Q[(state, 'forward')],
                        self.Q[(state, 'left')], self.Q[(state, 'right')]]
        # Softmax (note: argmax of the softmax equals argmax of the raw values)
        Q_best_value = np.exp(Q_best_value) / np.sum(np.exp(Q_best_value), axis=0)
        # action = np.random.choice(self.possible_actions, p=probabilities)
        action = self.possible_actions[np.argmax(Q_best_value)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward == -10.0:
            print "Invalid move"
        if reward == -0.5:
            print "valid, but is it correct?"

        # Learn policy based on state, action, reward
        # self.Q[(state, action)] = reward + self.learning_rate * self.Q[(state, action)]
        self.Q[(state, action)] = reward * self.learning_rate + (1 - self.learning_rate) * self.Q[(state, action)]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.78, discount_factor=0.36):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.Q = {}
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.actions = Environment.valid_actions
        self.oldAction = None  # (state, action index, reward) from the previous step

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.oldAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state and select action according to the policy
        # action = np.random.choice(self.actions)  # random baseline (disabled)
        current_state = tuple(inputs.values() + [self.next_waypoint])
        self.state = (inputs, self.next_waypoint)
        if current_state not in self.Q:
            self.Q[current_state] = [3, 3, 3, 3]  # optimistic initialization
        Q_max = self.Q[current_state].index(np.max(self.Q[current_state]))

        # Execute action and get reward
        action = self.actions[Q_max]
        reward = self.env.act(self, action)

        # Learn policy based on the *previous* step's (state, action, reward),
        # now that the value of the current state is known
        if self.oldAction is not None:
            oldvalue = self.Q[self.oldAction[0]][self.oldAction[1]]
            newvalue = ((1 - self.learning_rate) * oldvalue) + \
                (self.learning_rate * (self.oldAction[2] + self.discount_factor * self.Q[current_state][Q_max]))
            self.Q[self.oldAction[0]][self.oldAction[1]] = newvalue
        self.oldAction = (current_state, self.actions.index(action), reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class RandomAgent(Agent):
    """A baseline agent that chooses actions at random."""

    def __init__(self, env):
        super(RandomAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.availableAction = ['forward', 'left', 'right', None]
        self.goal = 0            # number of times the destination was reached
        self.steps = 0           # steps taken in the current trial
        self.features = []       # per-trial record of sensed inputs
        self.deadline = []       # per-trial record of the last seen deadline
        self.total_reward = [0]  # per-trial cumulative reward

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; report the outcome of the trial that just ended
        # print "RESET, Final state:\n", self.state
        try:
            if self.deadline[len(self.features) - 1] > 0:  # deadline still positive: goal reached in time
                self.goal += 1  # FIXME - order
                print "PASS! {} steps to goal, Goal reached {} times out of {}!".format(
                    self.deadline[len(self.features) - 1], self.goal, len(self.features))
            else:
                print "FAIL! {} steps to goal, Goal reached {} times out of {}!".format(
                    self.deadline[len(self.features) - 1], self.goal, len(self.features))
        except IndexError:  # first call: no completed trial to report yet
            print "Trial 0 - Goal reached {} times out of {}!".format(self.goal, len(self.features))
        print "----------------------------------------------------------"
        self.features.append({})
        self.deadline.append(None)
        self.total_reward.append(0)
        self.steps = 0

    def update(self, t):
        # Gather inputs
        self.steps += 1
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        self.state = inputs
        self.features[len(self.features) - 1][self.steps] = inputs
        self.deadline[len(self.deadline) - 1] = self.env.get_deadline(self)

        # Select action according to the (random) policy
        action = self.availableAction[random.randint(0, 3)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.lastReward = reward

        # Track the cumulative reward for this trial
        self.total_reward[len(self.total_reward) - 1] += reward
class LearningAgent(Agent):
    """An agent that learns to drive in the Smartcab world.
    This is the object you will be modifying."""

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Set any additional class parameters as needed
        # self.a = 0.01
        self.t = 0

    def reset(self, destination=None, testing=False):
        """The reset function is called at the beginning of each trial.
        'testing' is set to True if testing trials are being used once
        training trials have completed."""

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function; if 'testing' is True, set epsilon
        # and alpha to 0 so the agent purely exploits what it has learned.
        self.t = self.t + 1
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            # Alternatives tried (kept for reference):
            # self.epsilon = self.epsilon - 0.05
            # self.epsilon = 0.98 ** self.t
            # self.epsilon = 1.0 / (self.t ** 2)
            # self.epsilon = math.exp(-(0.01 * self.t))
            self.epsilon = math.cos(0.003 * self.t)

        return None

    def build_state(self):
        """The build_state function is called when the agent requests data from
        the environment. The next waypoint, the intersection inputs, and the
        deadline are all features available to the agent."""

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'], inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """The get_maxQ function is called when the agent is asked to find the
        maximum Q-value of all actions based on the 'state' the smartcab is in."""

        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """The createQ function is called when a state is generated by the agent."""

        # When learning, check if the 'state' is not in the Q-table; if it is not,
        # create a new dictionary for that state and, for each action available,
        # set the initial Q-value to 0.0
        if self.learning and state not in self.Q:
            self.Q[state] = {}
            for action in self.valid_actions:
                self.Q[state][action] = 0.0

        return

    def choose_action(self, state):
        """The choose_action function is called when the agent is asked to choose
        which action to take, based on the 'state' the smartcab is in."""

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action.
        # When learning, choose a random action with 'epsilon' probability;
        # otherwise, choose an action with the highest Q-value for the current
        # state, breaking ties at random.
        if self.learning:
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                action = random.choice([a for a in self.valid_actions if self.Q[state][a] == maxQ])
        else:
            action = random.choice(self.valid_actions)

        return action

    def learn(self, state, action, reward):
        """The learn function is called after the agent completes an action and
        receives a reward. This function does not consider future rewards
        when conducting learning."""

        # When learning, implement the value iteration update rule.
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma').
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward

        return

    def update(self):
        """The update function is called when a time step is completed in the
        environment for a given trial. This function will build the agent state,
        choose an action, receive a reward, and learn if enabled."""

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env, gamma, alpha): # sets self.env = env, state = None, next_waypoint = None, and a default color super(LearningAgent, self).__init__(env) self.color = 'red' # override color # simple route planner to get next_waypoint self.planner = RoutePlanner(self.env, self) # Initialize additional variables and QLearning here self.actions = ['forward', 'left', 'right', None] self.states = [] self.q_learn = QLearning(self.actions, states=self.states, gamma=gamma, alpha=alpha) self.p_action = None self.p_reward = None def reset(self, destination=None): self.planner.route_to(destination) # Prepare for a new trip; reset any variables here, if required def get_state_code(self, inputs): dic = { 'red': 1, 'green': 2, 'forward': 1, 'left': 2, 'right': 3, None: 4 } state_code = 0 for i in xrange(len(inputs)): state_code += dic[inputs[i]] * 10**i return state_code def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # Get previous state and action taken in state p_state, p_action, p_reward = self.state, self.p_action, self.p_reward # Get current state state = self.get_state_code([self.next_waypoint] + inputs.values()) # if state is observed for the first time add it to Q and states list self.q_learn.add_new_state(state) # To avoid crash at first run when p_state == p_action == p_reward == None if p_state is not None: # Learn policy based on state, action, reward self.q_learn.update_Q(p_reward=p_reward, p_state=p_state, p_action=p_action, c_state=state) # Select action according to your policy action = self.q_learn.get_best_action(state) # Execute action and get reward reward = self.env.act(self, action) # sets previous state and action based for next state self.p_action, self.state, self.p_reward = action, state, reward global total_actions, total_rewards total_actions += 1 total_rewards += reward print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format( deadline, inputs, action, reward) # [debug]
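# get_state_code above is fed [self.next_waypoint] + inputs.values(), so the digit
# positions depend on the dict's iteration order, which is an implementation detail.
# A minimal sketch of a more robust encoding that fixes the key order explicitly; the
# key names match the environment's sense() output and the digit mapping mirrors the
# one used above.
def get_state_code_stable(next_waypoint, inputs):
    dic = {'red': 1, 'green': 2, 'forward': 1, 'left': 2, 'right': 3, None: 4}
    ordered = [next_waypoint, inputs['light'], inputs['oncoming'],
               inputs['left'], inputs['right']]
    state_code = 0
    for i, value in enumerate(ordered):
        state_code += dic[value] * 10 ** i
    return state_code

# Example:
# inputs = {'light': 'green', 'oncoming': None, 'left': None, 'right': 'forward'}
# get_state_code_stable('forward', inputs)
#   -> 1*1 + 2*10 + 4*100 + 4*1000 + 1*10000 = 14421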
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 if testing: self.alpha = 0 self.epsilon = 0 else: self.epsilon = max( self.epsilon - 0.01, 0.0) # Make epsilon smaller as the number of trials increases return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent # When learning, check if the state is in the Q-table # If it is not, create a dictionary in the Q-table for the current 'state' # For each action, set the Q-value for the state-action pair to 0 state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right']) return state def get_maxQ_action(self, state): """ The get_max_Q_action function is called when the agent is asked to find the action with the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Find the action that gives the maximum Q-value of all actions for a given state state_dict = self.Q[state] # find the max Q value maxQ = max(state_dict.values()) # choose a random action out of all actions with the highest Q return random.choice( [item[0] for item in state_dict.items() if item[1] >= maxQ]) def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if (self.learning and state not in self.Q): self.Q[state] = {action: 0.0 for action in self.valid_actions} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state action = random.choice(self.valid_actions) if ( not self.learning) else (random.choice(self.valid_actions) if (random.uniform(0, 1) < self.epsilon) else self.get_maxQ_action(state)) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: self.Q[state][action] = ( 1 - self.alpha) * self.Q[state][action] + self.alpha * reward return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor # Set any additional class parameters as needed self.t = 0 random.seed(42) def linear_decay(self): """ A linear decaying function for the initial Q-Learning implementation """ return self.epsilon - 0.05 def optimized_decay(self): """ A optimized version of the decaying function """ return math.pow(math.e, -.005 * self.t) def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 self.t += 1 if testing: self.epsilon = 0 self.alpha = 0 else: self.epsilon = self.optimized_decay() return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline # Set 'state' as a tuple of relevant data for the agent return (inputs['oncoming'], inputs['left'], inputs['light'], waypoint) def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ # Calculate the maximum Q-value of all actions for a given state qs = self.Q[state] return max(qs[action] for action in self.valid_actions) def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning and state not in self.Q: self.Q[state] = {action: 0.0 for action in self.valid_actions} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if not self.learning: action = random.choice(self.valid_actions) else: if self.epsilon > random.random(): action = random.choice(self.valid_actions) else: maxQ = self.get_maxQ(state) best_actions = [ action for action, q in self.Q[state].iteritems() if maxQ == q ] action = random.choice(best_actions) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: q = self.Q[state][action] self.Q[state][action] = q + self.alpha * (reward - q) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)       # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)    # Create a route planner
        self.valid_actions = self.env.valid_actions    # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning    # Whether the agent is expected to learn
        self.Q = dict()             # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon      # Random exploration factor
        self.alpha = alpha          # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.testing = False
        self.t = 0.0  # t - number of trials so far

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.testing = testing
        self.t = self.t + 1.0
        if self.testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # First Q-learning implementation: epsilon = epsilon - 0.05
            # self.epsilon = self.epsilon - 0.05
            # Improved Q-learning implementation: epsilon = 1 / (t*t)
            self.epsilon = 1.0 / (self.t * self.t)
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus
        # learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs)
        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state.
        # States are keyed by their string representation (see createQ below).
        # Recommended method per code review:
        maxQ = max(self.Q[str(state)].values())
        print("max Q value = ", maxQ)
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent.
""" ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 # Code review comment: Make sure your Q-Table is ONLY initialized when Learning == True if (not self.learning): return # convert state to string state_str = str(state) if state_str in self.Q.keys(): return else: dict_item = dict() for action in self.valid_actions: dict_item[action] = 0.0 self.Q[state_str] = dict_item return def getQ(self, state, action): """ Yuefeng add this function for calculating running average of rewards. """ qvalue = 0.0 state_str = str(state) if state_str in self.Q.keys(): vdict = self.Q[state_str] if action in vdict.keys(): qvalue = vdict[action] else: print("getQ: action not in vdict!!!") else: print("getQ: state not in self.Q!!!") return qvalue def setQ(self, state, action, qvalue): """ Yuefeng add this function for calculating running average of rewards and then updating Q value. """ state_str = str(state) if state_str in self.Q.keys(): vdict = self.Q[state_str] if action in vdict.keys(): vdict[action] = qvalue else: print("setQ: action not in vdict!!!") else: print("setQ: state not in self.Q!!!") return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie". state_str = str(state) action_index = random.randint(0, 3) if (not self.learning): action = self.valid_actions[action_index] else: random_number = random.uniform(0.0, 1.0) if random_number <= self.epsilon: action = self.valid_actions[action_index] print("exploration, action = ", action) else: # recommended method per code review best_actions = [ action for action in self.valid_actions if self.Q[str( state)][action] == max(self.Q[str(state)].values()) ] action = random.choice(best_actions) """ maxQ = self.get_maxQ(state) best_actions = [] dict_item = self.Q[state_str] for action in self.valid_actions: qvalue = dict_item[action] if qvalue >= maxQ: best_actions.append(action) number_of_actions = len(best_actions) if number_of_actions < 1: print("choose_action error: best actions list is empty!") action = None elif number_of_actions == 1: action = best_actions[0] print("only one best action = ", action) else: best_action_index = random.randint(0, number_of_actions - 1) action = best_actions[best_action_index] print("multiple best actions, randomly selected action = ", action) """ return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives a reward. This function does not consider future rewards when conducting learning. 
""" ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') state_str = str(state) if self.learning: # get the previous Q value for the pair of (state, action) qvalue = self.getQ(state, action) print("current Q value = ", qvalue) # get alpha alpha = self.alpha # calculate new Q value new_qvalue = (1.0 - alpha) * qvalue + alpha * reward print("new Q value = ", new_qvalue) # update Q value self.setQ(state, action, new_qvalue) else: None return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.cnt = 1 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 # linear # self.epsilon -= 0.05 # e^t # self.epsilon = math.exp(-.08*self.cnt) # # 1/t^2 # self.epsilon = 100 * math.pow((1/self.cnt), 2) # # a^t # self.epsilon = math.pow(.991, self.cnt) # # self.alpha = 1 - math.pow((1/self.cnt), 2) # cos if self.cnt < 300: self.epsilon = .6 + .2 * math.cos(.6 * self.cnt) else: self.epsilon = math.pow(.991, self.cnt) # Update additional class parameters as needed self.cnt += 1 # If 'testing' is True, set epsilon and alpha to 0 if testing: self.epsilon = 0 self.alpha = 0 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right']) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQ = max(self.Q[state].values()) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning and not self.Q.has_key(state): self.Q[state] = {None: 0., 'forward': 0., 'left': 0., 'right': 0.} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if self.learning: if random.uniform(0, 1) < self.epsilon: action = random.choice(self.valid_actions) else: action_list = [] mQ = self.get_maxQ(state) # for k, v in self.Q[state].items(): # if v == mQ: # action_list.append(k) # action = random.choice(action_list) max_keys = [k for k, v in self.Q[state].items() if v == mQ] action = random.choice(max_keys) # not learning else: action = random.choice(self.valid_actions) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: old_q = self.Q[state][action] pred_q = reward self.Q[state][action] += self.alpha * (pred_q - old_q) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.train_no = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 import math self.train_no = self.train_no + 1 # self.epsilon = math.e**(-0.01 * self.train_no) # self.alpha = math.e**(-0.001 * (self.train_no)) # self.epsilon = self.epsilon - 0.05*self.train_no # self.alpha = self.alpha - 0.05*self.train_no # self.alpha -= 0.005 # for Question 5, decay function self.epsilon -= 0.05 # for Question 6, use exponential function # self.epsilon = 0.7**self.train_no # results # Trial Aborted! # Agent did not reach the destination. # for Question 6, use 1/train_no/train_no # self.epsilon = 1/self.train_no **2 # results # Trial Aborted! # Agent did not reach the destination. # for Question 6, use exponential function # self.epsilon = math.e**(-0.01 * self.train_no) # self.alpha = math.e**(-0.001 * (self.train_no))did not use # results # Trial Completed! # Agent reached the destination. if testing: self.epsilon = 0 self.alpha = 0 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # NOTE : you are not allowed to engineer eatures outside of the inputs available. # Because the aim of this project is to teach Reinforcement Learning, we have placed # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn # about the balance between exploration and exploitation. # With the hand-engineered features, this learning process gets entirely negated. # Set 'state' as a tuple of relevant data for the agent state = (inputs['light'], inputs['oncoming'], inputs['left'], waypoint) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. 
""" ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state # q = [self.Q.get((state,action), 0.0) for action in self.valid_actions] q = [self.Q[state][action] for action in self.Q[state]] if q: maxQ = max(q) else: maxQ = 0.0 #maxAction = max(self.Q[state], key=self.Q[state].get) #return self.Q[state][maxAction] return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning: if state not in self.Q: self.Q[state] = {} for action in self.valid_actions: self.Q[state][action] = 0.0 return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie". if not self.learning: action = random.choice(self.valid_actions) else: if random.random() < self.epsilon: action = random.choice(self.valid_actions) else: maxQ = self.get_maxQ(state) max_actions = [] for action in self.Q[state]: if self.Q[state][action] == maxQ: max_actions.append(action) action = random.choice(max_actions) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives a reward. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: self.Q[state][action] = self.Q[state][action] * ( 1 - self.alpha) + reward * self.alpha return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor # Set any additional class parameters as needed # Counter object for keeping track of trial No self.count = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Update counter to keep track of trial No self.count += 1 # Select the destination as the new location to route to self.planner.route_to(destination) # Update alpha and epsilon parameters if testing == True: self.epsilon = 0 self.alpha = 0 else: # Update epsilon using decay function self.epsilon = 1 / (1 + math.exp(0.05 * (self.count - 150))) # Default agent epsilon function #self.epsilon = self.epsilon - 0.05 # Update additional class parameters as needed return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline # Set 'state' as a tuple of relevant data for the agent state = (waypoint, inputs) return state def get_state_key(self, state): """Unpacks state variable into a format appropriate for a dictionary key""" waypoint, inputs = state light = inputs['light'] left = inputs['left'] # excludes inputs['right'] oncoming = inputs['oncoming'] # Construct key state_key = (waypoint, light, left, oncoming) return state_key def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ state_key = self.get_state_key(state) # Calculate the maximum Q-value of all actions for a given state # In the case of a tie, max() chooses the value that came first. Since we are calling # max on a dictionary, the elements should be randomly ordered to begin with. maxQ = None maxQ_action = None for pair in self.Q[state_key].items(): action = pair[0] Qvalue = pair[1] if maxQ == None: maxQ = Qvalue maxQ_action = action else: if maxQ == Qvalue: maxQ, maxQ_action = random.choice([(maxQ, maxQ_action), (Qvalue, action)]) elif Qvalue > maxQ: maxQ = Qvalue maxQ_action = action else: continue return maxQ_action, maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. 
Updates Q-table with new state""" if self.learning == True: state_key = self.get_state_key(state) if state_key not in self.Q.keys(): self.Q[state_key] = { action: 0.0 for action in self.valid_actions } return None def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() # When not learning, choose a random action if self.learning == False: action = random.choice(self.valid_actions) # When learning, choose a random action with 'epsilon' probability if self.learning == True: if random.random() < self.epsilon: action = random.choice(self.valid_actions) # Otherwise, choose an action with the highest Q-value for the current state else: action, maxQ = self.get_maxQ( state) # See get_maxQ() for explaination of ties return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives a reward. This function does not consider future rewards when conducting learning. """ state_key = self.get_state_key(state) # When learning, implement the value iteration update rule if self.learning == True: self.Q[state_key][action] = self.Q[state_key][ action] + self.alpha * (reward - self.Q[state_key][action]) return None def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
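# A caveat about the tie-breaking in get_maxQ above: dict iteration order in CPython is
# arbitrary but deterministic, not random, and flipping a coin pairwise while scanning
# is not uniform once three or more actions tie. With ties a, b, c scanned in that
# order, c survives a single coin flip (probability 1/2) while a and b must each survive
# two (probability 1/4). Collecting all tied actions first and calling random.choice on
# the list is uniform. A minimal sketch that samples the pairwise scan to show the skew;
# the action names and loop are illustrative only.
import random

def pairwise_scan(tied_actions):
    winner = tied_actions[0]
    for action in tied_actions[1:]:
        winner = random.choice([winner, action])
    return winner

counts = {'a': 0, 'b': 0, 'c': 0}
for _ in range(30000):
    counts[pairwise_scan(['a', 'b', 'c'])] += 1
print(counts)  # roughly 1/4, 1/4, 1/2 rather than 1/3 each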
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict() # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor # Set any additional class parameters as needed self.t = 0 random.seed(42) def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 if testing: self.epsilon = 0.0 self.alpha = 0.0 else: # commented out testing parameters self.t += 1.0 self.epsilon = float(1/(float(self.t **2))) return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense(self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline # Set 'state' as a tuple of relevant data for the agent # When learning, check if the state is in the Q-table # If it is not, create a dictionary in the Q-table for the current 'state' # For each action, set the Q-value for the state-action pair to 0 # helper to create state string state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming']) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ # Calculate the maximum Q-value of all actions for a given state # preset an initialization value that should be replaced by a more valid Q value in the loop. maxQ = -1000.0 for action in self.Q[state]: if maxQ < self.Q[state][action]: maxQ = self.Q[state][action] return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning: self.Q[state] = self.Q.get(state, {None:0.0, 'forward':0.0, 'left':0.0, 'right':0.0}) return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if not self.learning: action = random.choice(self.valid_actions) else: if self.epsilon > 0.01 and self.epsilon > random.random(): action = random.choice(self.valid_actions) else: valid_actions = [] maxQ = self.get_maxQ(state) for act in self.Q[state]: if maxQ == self.Q[state][act]: valid_actions.append(act) action = random.choice(valid_actions) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: self.Q[state][action] = self.Q[state][action] + self.alpha*(reward-self.Q[state][action]) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=True, epsilon=0.5, alpha=0.5):
        super(LearningAgent, self).__init__(env)       # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)    # Create a route planner
        self.valid_actions = self.env.valid_actions    # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning    # Whether the agent is expected to learn
        self.Q = dict()             # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon      # Random exploration factor
        self.alpha = alpha          # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # Other decay functions tried during tuning:
            #   self.epsilon = self.epsilon - 0.05
            #   self.epsilon = 1.0 / (self.t ** 2)
            #   self.epsilon = math.fabs(math.cos(self.alpha * self.t))
            #   self.epsilon = math.exp(-0.5 * self.t)
            self.epsilon = self.epsilon * 0.955
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus
        # learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'])
        print 'build_state', state
        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state.
        # Taking max() over the stored values also handles negative Q-values,
        # which an accumulator initialized to 0.0 would not.
        maxQ = max(self.Q[state].values())
        print 'maxQ value', maxQ
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        # Note: only actions belong here; 'green'/'red' are light states, not actions.
        if self.learning and state not in self.Q:
            self.Q[state] = {None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0}
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly
        # select between actions that "tie".
        if not self.learning:
            action = random.choice(self.valid_actions)
        elif self.epsilon >= random.random():
            action = random.choice(self.valid_actions)
        else:
            maxQ = self.get_maxQ(state)
            best_actions = [act for act in self.Q[state] if self.Q[state][act] == maxQ]
            action = random.choice(best_actions)
        print 'the chosen action:', action
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent state,
            choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn
        return
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        valid_actions = [None, 'forward', 'left', 'right']
        TL_valid_states = [True, False]
        # Documentation of the full state space (not used directly below)
        state_desc = {
            'light': TL_valid_states,
            'oncoming': valid_actions,
            'left': valid_actions,
            'right': valid_actions,
            'next_waypoint': ['forward', 'left', 'right']
        }
        self.Q_prev = dict()
        self.gamma = 0.1
        self.alpha = 0.2
        self.p_threshold = 1

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def state_action_key(self, state, action):
        """Encode a (state, action) pair as a comma-separated string key."""
        return ",".join([str(s) for s in state] + [str(action)])

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        valid_actions = [None, 'forward', 'left', 'right']

        # TODO: Update state
        self.state = (inputs['light'], self.next_waypoint)

        # TODO: Select action according to your policy
        # (greedy over Q, breaking ties at random; unseen pairs default to 0)
        Q_actions = [self.Q_prev.setdefault(self.state_action_key(self.state, a), 0)
                     for a in valid_actions]
        Q_max = max(Q_actions)
        best_actions = [valid_actions[i] for i in range(len(valid_actions))
                        if Q_actions[i] == Q_max]
        action = random.choice(best_actions)
        key_now = self.state_action_key(self.state, action)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward.
        # The discounted term must use the max Q-value of the *next* state,
        # not Q_max of the state we just acted in.
        inputs_next = self.env.sense(self)
        next_state = (inputs_next['light'], self.planner.next_waypoint())
        Q_max_next = max(self.Q_prev.get(self.state_action_key(next_state, a), 0)
                         for a in valid_actions)
        self.Q_prev[key_now] = (1 - self.alpha) * self.Q_prev[key_now] + \
            self.alpha * (reward + self.gamma * Q_max_next)
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.9): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor self.counter = 1 ########### ## TO DO ## ########### # Set any additional class parameters as needed def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 if testing: self.alpha = 0 self.epsilon = 0 else: self.epsilon = 0.999**self.counter self.counter += 1 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent # When learning, check if the state is in the Q-table # If it is not, create a dictionary in the Q-table for the current 'state' # For each action, set the Q-value for the state-action pair to 0 #this reflects the final optimized model state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming']) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxAction = max(self.Q[state], key=lambda x: self.Q[state][x]) maxQ = self.Q[state][maxAction] return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if state not in self.Q: self.Q[state] = {'left': 0, 'right': 0, 'forward': 0, None: 0} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.old_state = self.state self.state = state self.next_waypoint = self.planner.next_waypoint() action = random.choice(self.valid_actions) ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if self.learning and random.random() > self.epsilon: maxVal = self.get_maxQ(state) #choose randomly if there are more than one max values pos_actions = [ act for act in pos_actions if pos_actions[act] == maxVal ] action = random.choice(pos_actions) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') old_value = self.Q[state][action] if self.learning: self.Q[state][action] = ( 1 - self.alpha) * old_value + self.alpha * (reward) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5, exploration=1): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor self.exploration = exploration # Exploration factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.t = 0 random.seed(666) def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 if testing: self.epsilon = 0.0 self.alpha = 0.0 else: #self.epsilon = self.epsilon - 0.05 self.t += 1.0 if self.exploration == 1: self.epsilon = 1.0 / (self.t**2) elif self.exploration == 2: self.epsilon = 1.0 / (self.t**2 + self.alpha * self.t) elif self.exploration == 3: self.epsilon = 1.0 / (self.t**2 - self.alpha * self.t) elif self.exploration == 4: self.epsilon = math.fabs(math.cos(self.alpha * self.t)) elif self.exploration == 5: self.epsilon = math.fabs(math.cos( self.alpha * self.t)) / (self.t**2) return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent # Create state string def state_str(state): if state is None: return 'None' else: return str(state) state = state_str(waypoint) + "_" + inputs['light'] + "_" + state_str( inputs['left']) + "_" + state_str(inputs['oncoming']) self.createQ(state) # Create 'state' in Q-table return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQ = max(self.Q[state].values()) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. 
""" ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if (self.learning) and (state not in self.Q): self.Q[state] = {action: 0.0 for action in self.valid_actions} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state possible_actions = self.Q[state] if not self.learning: action = random.choice(possible_actions) else: choose_using_epsilon = random.random() < 1 - self.epsilon if not choose_using_epsilon: maxQ = self.get_maxQ(state) best_actions = [ action for action in possible_actions if self.Q[state][action] == maxQ ] action = random.choice(best_actions) else: action = max(possible_actions.items(), key=lambda x: x[1])[0] return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: self.Q[state][action] = self.Q[state][action] + self.alpha * ( reward - self.Q[state][action]) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent, self).__init__(env) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here #Set traffic_light and movement traffic_light=["red","green"] motion = [None, 'forward', 'left', 'right'] waypoint,oncoming,left,right=motion,motion,motion,motion #Initialize q_table(States are traffic_light,waypoint,oncoming,left and right) #Set all features to 0 self.q_table = {} for light in traffic_light: for point in waypoint: for on in oncoming: for lf in left: for ri in right: self.q_table[(light, point, on, lf,ri)] = {None: 0, 'forward': 0, 'left': 0, 'right': 0} # print self.q_table self.episode=0 self.preserve=[] self.failure=0 def reset(self,destination=None,total=0): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.total_reward=total def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint() # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) old_time=t print 'Old_TIME',old_time # TODO: Update state self.state = (inputs['light'], self.next_waypoint, inputs['oncoming'], inputs['left'], inputs['right']) print "The current state is: {}".format(self.state) print "t:{}".format(t) # TODO: Select action according to your policy epsilon=0.1 #epsilon=1.0 rand=random.random() if rand<epsilon: action=random.choice(Environment.valid_actions[0:]) print "RANDOM ACTION" else: if max(self.q_table[self.state].values())==0: action=random.choice(Environment.valid_actions[0:]) print self.q_table[self.state] else: action = max(self.q_table[self.state], key=self.q_table[self.state].get) print action # Execute action and get reward reward = self.env.act(self, action) print "REWARD IS:",reward self.total_reward+=reward print "Total Reward",self.total_reward
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the environment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict() # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.trial = 1 self.epsilonDecayVariable = epsilon self.alphaDecayVariable = alpha def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice #self.epsilon = self.epsilon - .05 # epsilon = e^(-a*t), for 0 < a < 1 #self.epsilon = math.exp(-self.epsilonDecayVariable*self.trial*1) # epsilon = cos(a*t), for 0 < a < 1 #self.epsilon = math.cos(self.epsilonDecayVariable*self.trial) trialNum = self.trial ParmA = -5E-11 ParmB = 0.104 ParmC = 210 self.epsilon = 1 * math.exp(-ParmA * trialNum) / (1 + math.exp(ParmB * (trialNum - ParmC))) #self.epsilon = self.epsilon - .05 #self.epsilon = math.exp(-0.0005*self.trial)*(4+math.cos(.7*self.trial))/5 # = EXP(-lambda*A1)*(4+COS(freq*A1))/5 # decay learning rate #self.alpha = .75 #math.exp(-self.alphaDecayVariable*self.trial) #self.epsilon = math.pow(self.epsilonDecayVariable, self.trial) # Update additional class parameters as needed # not sure on additional class params - time remaining, success rate, etc.? self.trial = self.trial + 1 # If 'testing' is True, set epsilon and alpha to 0 if testing: self.epsilon = 0 self.alpha = 0 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense(self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent # When learning, check if the state is in the Q-table # If it is not, create a dictionary in the Q-table for the current 'state' # For each action, set the Q-value for the state-action pair to 0 state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'], waypoint) if self.learning: if state not in self.Q: # state not yet in the Q-table self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0} return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # get the highest Q-value from the dict of actions for this state: 'state' param.
maxQ = max(self.Q[state].values()) #maxQ = None return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning: if state not in self.Q: self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() #action = random.choice(self.valid_actions) #print(action) ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if self.learning: # if the random draw falls within epsilon, explore with a random action lDblRand = random.random() if lDblRand < self.epsilon: action = random.choice(self.valid_actions) else: # exploit: pick the action with the highest Q-value maxQ = self.get_maxQ(state) dictActions = self.Q[state] action = dictActions.keys()[dictActions.values().index(maxQ)] else: # not learning: always choose randomly action = random.choice(self.valid_actions) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives a reward. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + (reward * self.alpha) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
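# ---------------------------------------------------------------------------
# Hedged worked example: the alpha-only update rule used by learn() above,
# Q <- (1 - alpha) * Q + alpha * reward, pulls the Q-value toward the observed
# reward. With alpha = 0.5 and a constant reward of 2.0:
Q = 0.0
for _ in range(4):
    Q = (1 - 0.5) * Q + 0.5 * 2.0
    # Q takes the values 1.0, 1.5, 1.75, 1.875 -- converging toward 2.0
# ---------------------------------------------------------------------------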
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5, epsilon_decay=0.05): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.epsilon_decay = epsilon_decay self.step = 0 self.global_step = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 self.step = 0 self.nsteps = self.env.get_deadline(self) if testing: self.epsilon = 0.0 self.alpha = 0.0 else: #self.epsilon -= self.epsilon_decay self.epsilon = math.cos(self.global_step * self.epsilon_decay) if self.epsilon < 0.0: self.epsilon = 0.0 #self.alpha = 0.98 * self.alpha return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent # When learning, check if the state is in the Q-table # If it is not, create a dictionary in the Q-table for the current 'state' # For each action, set the Q-value for the state-action pair to 0 state = tuple({ 'waypoint': waypoint, 'light': inputs['light'], 'left': 'forward' if inputs['left'] == 'forward' else None, 'oncoming': inputs['oncoming'] }.items()) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQs = list() if state in self.Q.keys(): max_value = float('-inf') for action in self.Q[state].keys(): value = self.Q[state][action] #print("Action found for state: %s - value: %f" % (str(action), value)) if value > max_value: max_value = value maxQs = [action] elif value == max_value: maxQs.append(action) #print("maxQs: %s" % str(maxQs)) if (len(maxQs) == 0): return None return maxQs[0] if (len(maxQs) == 1) else random.choice(maxQs) def createQ(self, state): """ The createQ function is called when a state is generated by the agent. 
""" ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning and state not in self.Q.keys(): self.Q[state] = { 'forward': 0.0, 'left': 0.0, 'right': 0.0, None: 0.0 } return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability self.step += 1 self.global_step += 1 # Otherwise, choose an action with the highest Q-value for the current state if self.learning == True: choice = np.random.choice((0, 1), p=(self.epsilon, 1 - self.epsilon)) if choice == 0: print( "### Explore (step %d, nsteps %d, epsilon %f, epsilon*nsteps %f)" % (self.step, self.nsteps, self.epsilon, self.epsilon * self.nsteps)) action = random.choice(self.valid_actions) else: print( "@@@ Exploit (step %d, nsteps %d, epsilon %f, epsilon*nsteps %f)" % (self.step, self.nsteps, self.epsilon, self.epsilon * self.nsteps)) action = self.get_maxQ(state) else: action = random.choice(self.valid_actions) print("self.learning: %s" % str(self.learning)) print("Q-learning database size: %d" % len(self.Q)) print("self.valid_actions: %s" % str(self.valid_actions)) print("action: %s" % str(action)) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning == False: return if state in self.Q.keys(): old_value = self.Q[state][action] self.Q[state][action] = old_value + self.alpha * (reward - old_value) #print("Action %s got updated to %f\n" % (action, self.Q[state][action])) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the environment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict() # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed #self.trial = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 global trial # assumes a module-level 'trial' counter initialized elsewhere trial = trial + 1 if testing == False: self.epsilon -= 0.05 #self.epsilon = math.pow(0.97, trial) elif testing == True: self.epsilon = 0 self.alpha = 0 print self.epsilon return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense(self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent #state = (waypoint, inputs) state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left']) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQ = max(self.Q[state].values()) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning: if state not in self.Q: # check the state passed in, not the stale self.state self.Q[state] = {None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in.
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() #action = 'forward' #action = None action_list = [None, 'left', 'right', 'forward'] #action = random.choice(action_list) ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if self.learning == False: action = random.choice(action_list) elif self.learning == True: if random.random() <= self.epsilon: action = random.choice(action_list) #print action #print self.Q else: #print self.Q self.createQ(self.state) #print self.Q[self.state] #print self.get_maxQ(self.state) action = self.get_action(self.Q[self.state], self.get_maxQ(self.state)) print action return action def get_action(self, dictionary, value): for k, v in dictionary.iteritems(): if v == value: return k def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') self.createQ(self.state) #self.Q[state][action] = (1- self.alpha) * self.Q[state][action] + self.alpha * reward self.Q[state][action] = reward return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.t = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 if testing: self.epsilon = 0 self.alpha = 0 else: self.t += 1 # self.epsilon-=0.05 # self.epsilon=(self.alpha**self.t) # self.epsilon=1.0/(self.t**2) # self.epsilon=math.exp(-(self.alpha * self.t)) self.epsilon = math.fabs(math.cos(self.alpha * self.t)) #self.epsilon = math.fabs(math.cos(self.alpha*self.t)) # self.epsilon -= self.epsilon / 100 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # NOTE : you are not allowed to engineer eatures outside of the inputs available. # Because the aim of this project is to teach Reinforcement Learning, we have placed # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation. # With the hand-engineered features, this learning process gets entirely negated. # Set 'state' as a tuple of relevant data for the agent # state = (waypoint, # inputs['left'], # inputs['light'], # inputs['oncoming']) # stringify = lambda s: 'None' if s is None else str(s) # state = [stringify(s) for s in state] state = (waypoint, inputs['light'], inputs['left'], inputs['right'], inputs['oncoming']) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. 
""" ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state # maxQ = max(self.Q[state].values()) # # return maxQ if state in self.Q: # Get all the Action on current state state_actions = self.Q[state] # Get all the values of actions at current state state_actions_val = state_actions.values() # Take maximum value of action value on current state maxQ = max(state_actions_val) print('MaxWQ', maxQ) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 # Set Default value of action action_initial_val = 0.0 # If learning is true if self.learning is True: if state not in self.Q: # self.Q = dict() # Adding all the validate action at current state self.Q[state] = { action: action_initial_val for action in self.valid_actions } return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie". if self.learning is not True: action = random.choice(self.valid_actions) elif self.learning is True: if self.epsilon > random.random(): print('Learning True') action = random.choice(self.valid_actions) else: # print(state) # Max Q-value at current state maxQ_Val = self.get_maxQ(state) # print('maxQ_val', maxQ_Val) # Actions list at current state action_at_state = self.Q[state] # print('action_at_state', action_at_state) max_Q_actions = [ val[0] for val in action_at_state.items() if val[1] == maxQ_Val ] # print('max_Q_actions', max_Q_actions) action = random.choice(max_Q_actions) # print('max_Q_actions action', action) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives a reward. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning is True: self.Q[state][action] = ( 1 - self.alpha) * self.Q[state][action] + (self.alpha * reward) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.serial = 0 # initialize Q-matrix self.Qmat = {} # initialize other variables # Basic Q-learning #self.alpha = 0.8 #self.gamma = 0.5 #self.epsilon = 0.0 # Enhanced Q-learning #self.alpha = 0.8 self.alpha = 0.4 #self.gamma = 1.0 #self.gamma = 0.5 self.gamma = 0.2 self.epsilon = 1.0 #self.success = 0. # This function reaches at the very beginning of the script # Open a csv file to keep track of the agent's success rate with open("SimulationResults_Enhanced.csv", "a") as outputFile: #with open("SimulationResults.csv","a") as outputFile: outputFile.write("\n{},{},{},{},".format("Gamma", self.gamma, "Alpha", self.alpha)) def reset(self, destination=None): self.planner.route_to(destination) # This function reaches at every start of a new trial # TODO: Prepare for a new trip; reset any variables here, if required #self.serial = 0 self.serial += 1 def update(self, t): # ----------------------------- check information from previous time step # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # TODO: Update state #printOK = True printOK = False self.state = (inputs['light'], self.next_waypoint) if printOK: print "1. current state is:", self.state # TODO: Select action according to your policy epsilon = (self.epsilon / (self.epsilon + self.serial)) if self.state in self.Qmat.keys(): if printOK: print " 1.a ----------------------------------{} exists".format( self.state) action_key_value = self.Qmat[ self.state] # this shows key/value of selected state in Qmat if printOK: print " Q table for this state: ", action_key_value # when multiple maxima exist - random selection actions_max = {actions:action_value for actions, action_value \ in action_key_value.items() if action_value == max(action_key_value.values())} action = random.choice(actions_max.keys()) if printOK: print " available actions={}".format(actions_max) if printOK: print " learned_action={}".format(action) # "exploring" takes place randomly if random.random() < epsilon: action = random.choice([None, 'forward', 'left', 'right']) if printOK: print " learned_action={} (random)".format(action) else: if printOK: print " 1.b ----------------------------------{} NEW state".format( self.state) self.Qmat[self.state] = { None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0 } # random action, when landed on a 'new' state action = random.choice([None, 'forward', 'left', 'right']) if printOK: print " random action={}".format(action) # Execute action and get reward reward = self.env.act(self, action) if printOK: print " Reward for this action={}".format(reward) # TODO: Learn policy based on state, action, reward # New state definition seems incorrect # FIX: New definition of next state: state and waypoint after action was determined above! inputs_new = self.env.sense(self) # This is the key; next_waypoint should be called twice: once above and once here! self.next_waypoint = self.planner.next_waypoint() new_state = (inputs_new['light'], self.next_waypoint) if printOK: print "2. 
next state is:", new_state if new_state in self.Qmat.keys(): Q_next = max(self.Qmat[new_state].values()) if printOK: print " 2.a -----------------------------Q table for NEXT state: ", self.Qmat[ new_state] else: if printOK: print " 2.a -----------------------------Q table for NEXT state: NOT FOUND" Q_next = 0. if printOK: print " 2.b -----------------------------Q[new state]=", Q_next Qhat = (1.0 - self.alpha) * self.Qmat[ self.state][action] + self.alpha * (reward + self.gamma * Q_next) self.Qmat[self.state][action] = Qhat # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward) # [debug] if printOK: print "3. CONCLUSION" if printOK: print " action is {}, state is {}".format(action, self.state) if printOK: print " Q table for (s,a) is: {}".format(self.Qmat[self.state])
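# ---------------------------------------------------------------------------
# Hedged sketch: the discounted update the agent above applies each step,
# shown as a standalone function with its "enhanced" parameters
# (alpha = 0.4, gamma = 0.2):
#   Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a'))
def q_update(q_sa, reward, q_next_max, alpha=0.4, gamma=0.2):
    return (1.0 - alpha) * q_sa + alpha * (reward + gamma * q_next_max)
# ---------------------------------------------------------------------------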
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=True, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.t = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) def decay_exponential(a): self.epsilon = a**self.t ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 if testing == True: self.epsilon = 0.0 self.alpha = 0.0 else: self.t += 1 decay_exponential(0.998) return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # NOTE : you are not allowed to engineer features outside of the inputs available. # Because the aim of this project is to teach Reinforcement Learning, we have placed # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation. # With the hand-engineered features, this learning process gets entirely negated. # Set 'state' as a tuple of relevant data for the agent state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right']) return state def get_maxQ(self, state): """ The get_maxQ function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQ = max(self.Q[state].values()) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning: if state not in self.Q: self.Q[state] = { None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0 } # Init default Q Score. return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie". if not self.learning: action = random.choice(self.valid_actions) else: if self.epsilon > random.random(): action = random.choice(self.valid_actions) else: max_Q = max(self.Q[state].values()) max_candidate = [ x for x in self.Q[state] if self.Q[state][x] == max_Q ] action = random.choice(max_candidate) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives a reward. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: try: self.Q[state][action] = (1 - self.alpha) * ( self.Q[state][action]) + self.alpha * reward except KeyError: pass return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.75): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions self.action_list = ['None', 'left', 'right', 'forward'] # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.n_trial = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 self.n_trial += 1 #self.alpha = 1.0 / self.n_trial**2 #self.epsilon = 1.0/ (self.n_trial**2) self.epsilon = 1.0 / self.n_trial #self.epsilon = 1.0/ ((self.n_trial+1.0)/2.0) #self.epsilon = 1.0 - (self.n_trial**1.5/50.0) #self.epsilon -= .05 if testing == True: self.epsilon = 0 self.alpha = 0 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent # When learning, check if the state is in the Q-table # If it is not, create a dictionary in the Q-table for the current 'state' # For each action, set the Q-value for the state-action pair to 0 state = (inputs['oncoming'], inputs['left'] == 'forward', inputs['light'], waypoint) if self.learning == True: if str(state) not in self.Q.keys(): self.Q[str(state)] = dict() for action in self.action_list: self.Q[str(state)].update({action: 0.0}) print(self.Q) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state # I found this code on stack overflow to get the max key maxQ = self.Q[str(state)][str( max(self.Q[str(state)], key=lambda key: self.Q[str(state)][key]))] return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. 
""" ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning == True: if str(self.state) not in self.Q.keys(): self.Q[str(state)] = dict() for action in self.action_list: self.Q[str(state)].update({action: 0.0}) return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state #random action action_list = [] if self.learning == False: action = self.valid_actions[random.randint(0, 3)] elif self.epsilon > random.random() and self.epsilon > 0: action = self.valid_actions[random.randint(0, 3)] else: for i in self.Q[str(state)]: if self.Q[str(state)][i] == self.get_maxQ(state): action_list.append(str(i)) if len(action_list) > 1: action = action_list[random.randint(0, len(action_list) - 1)] else: action = action_list[0] if action == 'None': action = None return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning == True: currentQ = self.Q[str(state)][str(action)] self.Q[str(state)][str( action)] = reward * self.alpha + currentQ * (1 - self.alpha) #self.alpha = self.alpha**1.5 return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env, **kwargs): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here add_total = 0 add_total = False self.success = 0 self.total = 0 self.counter = 0 self.epsilon_reset_counter = 0 self.trial_counter = 0.0 self.min_epsilon = 0.001 self.eps_freq = 1.0 self.filled_cell_count = 0 self.total_cell_count = 0 self.updated_func_counter = 0 global stats_df_counter global stats_df for key, value in kwargs.iteritems(): print "%s = %s" % (key, value) if key == 'alp': self.alpha = value elif key == 'gma': self.gamma = value elif key == 'eps': self.epsl = value self.epsilon = self.epsl print "epsilon: ", self.epsilon self.qt = QTable(self.alpha, self.gamma) print '-' * 80 def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required totalTime = self.env.get_deadline(self) self.qt.printVal(totalTime) self.trial_counter += 1.0 if self.epsilon > self.min_epsilon: self.epsilon = (5.0 * self.epsl) / self.trial_counter self.eps_freq = math.ceil(1.0 / self.epsilon) print "self.epsilon:", self.epsilon, ", self.eps_freq: ", self.eps_freq, "\n" def update(self, t): global stats_df global stats_df_counter self.counter += 1 # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator current_state = self.env.sense(self) self.state = current_state deadline = self.env.get_deadline(self) # TODO: Update state # TODO: Select action according to your policy #action = random.choice([None, 'forward', 'left', 'right']) #if self.total > 0 and self.total % self.epsilon_freq == 0.0: # print "simulated annealing at ", self.total # action = random.choice([None, 'forward', 'left', 'right']) #else: if self.epsilon > self.min_epsilon and deadline != 0 and deadline != self.eps_freq and math.floor( deadline % self.eps_freq) == 0.0: #self.epsilon_reset_counter += 1 action = random.choice([None, 'forward', 'left', 'right']) print "annealing now.", "self.epsilon:", self.epsilon, ", action: ", action, ", deadline:", deadline else: #print "self.counter: ", self.counter, ", multiplier:", (self.counter * self.epsilon) action = self.qt.get_next_action(self.next_waypoint, deadline, current_state) # Execute action and get reward reward = self.env.act(self, action) add_total = False if deadline == 0: add_total = True if reward > 10: self.success += 1 add_total = True if add_total: self.total += 1 print("success: {} / {}".format(self.success, self.total)) if self.total == 100: for item, frame in self.qt.qtable.iteritems(): for item2, frame2 in frame.iteritems(): for item3, frame3 in frame2.iteritems(): for item4, frame4 in frame3.iteritems(): self.total_cell_count += 1 #print("f4:", frame4) if frame4 != 0.0: #print "\n" self.printNav(item2) self.printTraffic(item3) self.printTrafficLight(item4) self.printAction(item) print "Q-Val: {0:.5f}".format(frame4) self.filled_cell_count += 1 print '-' * 80 print "updated cells: ", self.filled_cell_count, ", self.total_cell_count:", self.total_cell_count, ", updated_func_counter:", self.updated_func_counter print "self.alpha:", self.alpha, "self.gamma:", self.gamma, ", self.epsilon:", self.epsl, ", success:", self.success, " in 
steps: ", deadline stats_df.loc[stats_df_counter] = [ self.alpha, self.gamma, self.epsl, self.success, deadline ] stats_df_counter += 1 print '_' * 80 # print '_'*20 # TODO: Learn policy based on state, action, reward next_state_value = self.env.sense(self) next_state_deadline = self.env.get_deadline(self) next_state_waypoint = self.planner.next_waypoint() self.qt.update(self.next_waypoint, deadline, current_state, action, reward, next_state_value, next_state_waypoint, self, self.env) self.updated_func_counter += 1 def printAction(self, code): print '|', if code == 'AN': print "Action: None", elif code == 'BF': print "Action: Forward", elif code == 'CR': print "Action: Right", elif code == 'DL': print "Action: Left", print '|', def printNav(self, code): print '|', if code == 0: print "Nav: None", elif code == 1: print "Nav: Forward", elif code == 2: print "Nav: Right", elif code == 3: print "Nav: Left", def printTraffic(self, code): left_mask = 0b000011 right_mask = 0b001100 oncoming_mask = 0b110000 left_filtered = code & left_mask right_filtered = code & right_mask oncoming_filtered = code & oncoming_mask print '| Traffic state: ', if left_filtered == 0: print "Left: None", elif left_filtered == 1: print "Left: Forward", elif left_filtered == 2: print "Left: Right", elif left_filtered == 3: print "Left: Left", print '-+-', if right_filtered == 0: print "Right: None", elif right_filtered == 4: print "Right: Forward", elif right_filtered == 8: print "Right: Right", elif right_filtered == 12: print "Right: Left", print '-+-', if oncoming_filtered == 0: print "Oncoming: None", elif oncoming_filtered == 16: print "Oncoming: Forward", elif oncoming_filtered == 32: print "Oncoming: Right", elif oncoming_filtered == 48: print "Oncoming: Left", def printTrafficLight(self, code): print '| ', if code == 0: print "Light: Red", else: print "Light: Green",
class QLearningAgent(Agent): """An agent that learns to drive in the smartcab world using Q learning""" def __init__(self, env): super(QLearningAgent, self).__init__(env) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'yellow' # override color self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint ## initialize Q table here self.Q = dict() self.alpha = 1.0 self.epsilon = 0.0 self.gamma = 0.05 self.discount = self.gamma self.state = None def flipCoin(self, p): r = random.random() return r < p def reset(self, destination=None): self.planner.route_to(destination) def getQValue(self, state, action): return self.Q.get((state, action), 1) def getValue(self, state): # highest Q-value over all actions in this state return max(self.getQValue(state, action) for action in Environment.valid_actions) def getPolicy(self, state): # best action for this state, breaking ties randomly bestQValue = max(self.getQValue(state, action) for action in Environment.valid_actions) bestActions = [action for action in Environment.valid_actions if self.getQValue(state, action) == bestQValue] return random.choice(bestActions) def getAction(self, state): action = None if (self.flipCoin(self.epsilon)): action = random.choice(Environment.valid_actions) else: action = self.getPolicy(state) return action def qTable(self, state, action, nextState, reward): if((state, action) not in self.Q): self.Q[(state, action)] = 1 else: self.Q[(state, action)] = self.Q[(state, action)] + self.alpha*(reward + self.discount*self.getValue(nextState) - self.Q[(state, action)]) def update(self, t): self.next_waypoint = self.planner.next_waypoint() # from route planner, also displayed by simulator deadline = self.env.get_deadline(self) inputs = self.env.sense(self) inputs['waypoint'] = self.next_waypoint self.state = tuple(sorted(inputs.items())) action = self.getAction(self.state) reward = self.env.act(self, action) self.prevAction = action self.prevState = self.state self.prevReward = reward # NOTE: self.state was sensed before acting, so state and nextState coincide here; # a stricter implementation would re-sense the environment for nextState. if self.prevReward is not None: self.qTable(self.prevState, self.prevAction, self.state, self.prevReward) print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)
class LearningAgent(Agent): """An agent that learns how to drive in the smartcab world.""" def __init__(self, env, init_value=0, gamma=0.90, alpha=0.20, epsilon=0.10, discount_deadline=False, history=0): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override default color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint ## Initialize the Q-function as a dictionary (state) of dictionaries (actions) self.q_function = {} self.history = history if self.history > 0: self.init_q_function() ## Initial value of any (state, action) tuple is an arbitrary random number self.init_value = init_value ## Discount factor gamma: 0 (myopic) vs 1 (long-term optimal) self.gamma = gamma self.discount_deadline = discount_deadline ## Learning rate alpha: 0 (no learning) vs 1 (consider only most recent information) ## NOTE: Normally, alpha decreases over time: for example, alpha = 1 / t self.alpha = alpha ## Parameter of the epsilon-greedy action selection strategy ## NOTE: Normally, epsilon should also be decayed by the number of trials self.epsilon = epsilon ## The trial number self.trial = 1 ## The cumulative reward self.cumulative_reward = 0 def get_q_function(self): return self.q_function def set_params(self, init_value=0, gamma=0.90, alpha=0.20, epsilon=0.10, discount_deadline=False, history=0): self.init_value = init_value self.gamma = gamma self.alpha = alpha self.epsilon = epsilon self.discount_deadline = discount_deadline self.history = history def reset(self, destination=None): self.planner.route_to(destination) self.env.set_trial_number(self.trial) self.trial += 1 ## Decay the epsilon parameter # self.epsilon = self.epsilon / math.sqrt(self.trial) # TODO: Prepare for a new trip; reset any variables here, if required self.cumulative_reward = 0 def init_q_function(self): ''' Initializes the Q-tables with previously learned results. 
''' csv_files = glob.glob('../q_tables/*.csv') history_counter = 0 state_counter = {} for csv_file in csv_files: q_df = pd.read_csv(csv_file, sep=',', header=None) ## Assign the header header = ['state' ] + [str(action) for action in self.env.valid_actions] q_df.columns = header for i in xrange(q_df.shape[0]): state = q_df.ix[i]['state'] state = state[1:-1] state_tuple = literal_eval(state) if state_tuple in self.q_function: action_function = self.q_function[state_tuple] for action in self.env.valid_actions: current_value = action_function[action] action_function[action] = current_value + q_df.ix[i][ str(action)] self.q_function[state_tuple] = action_function ## Update the frequency counter counter = state_counter[state_tuple] state_counter[state_tuple] = counter + 1 else: action_function = {} for action in self.env.valid_actions: action_function[action] = q_df.ix[i][str(action)] self.q_function[state_tuple] = action_function state_counter[state_tuple] = 1 history_counter += 1 if history_counter >= self.history: break ## Average all action values for state in state_counter.keys(): count = state_counter[state] action_function = self.q_function[state] for action in self.env.valid_actions: current_value = action_function[action] action_function[action] = current_value / count self.q_function[state] = action_function def select_action(self, state=None, is_current=True, t=1): ''' Implements the epsilon-greedy action selection that selects the best-valued action in this state (if is_current) with probability (1 - epsilon) and a random action with probability epsilon. 't' is the current time step that can be used to modify epsilon. ''' if state in self.q_function: ## Find the action that has the highest value action_function = self.q_function[state] q_action = max(action_function, key=action_function.get) if is_current: ## Generate a random action rand_action = random.choice(self.env.valid_actions) ## Select action using epsilon-greedy heuristic rand_num = random.random() action = q_action if rand_num <= ( 1 - self.epsilon) else rand_action else: action = q_action else: ## Initialize <state, action> pairs and select random action action_function = {} for action in self.env.valid_actions: action_function[action] = self.init_value self.q_function[state] = action_function action = random.choice(self.env.valid_actions) return action def update(self, t): ''' At each time step t, the agent: - Is given the next waypoint (relative to its current location and direction) - Senses the intersection state (traffic light and presence of other vehicles) - Gets the current deadline value (time remaining) ''' ## The destination trying to reach # destination = self.env.agent_states[self]['destination'] ## Observe the current state variables ## (1) Traffic variables inputs = self.env.sense(self) light = inputs['light'] oncoming = inputs['oncoming'] left = inputs['left'] ## (2) Direction variables self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator ## Update the current observed state self.state = (light, oncoming, left, self.next_waypoint) current_state = self.state # save this for future use ## Select the current action action = self.select_action(state=current_state, is_current=True, t=t) ## Execute action, get reward and new state reward = self.env.act(self, action) self.cumulative_reward += reward self.env.set_cumulative_reward(self.cumulative_reward) ## Retrieve the current Q-value current_q = self.q_function[self.state][action] # print 'Current Q = ' + 
str(current_q) ## Update the state variables after action ## (1) Traffic variables new_inputs = self.env.sense(self) light = new_inputs['light'] oncoming = new_inputs['oncoming'] left = new_inputs['left'] ## (2) Direction variables self.next_waypoint = self.planner.next_waypoint() if self.discount_deadline: deadline = self.env.get_deadline(self) if t == 1: ## TODO: Set this as an input param self.gamma = 1 - float(4) / deadline ## Update the new state, which is a tuple of state variables self.state = (light, oncoming, left, self.next_waypoint) ## Get the best q_action in the new state new_action = self.select_action(state=self.state, is_current=False, t=t) ## Get the new Q_value new_q = self.q_function[self.state][new_action] ## Update the Q-function # current_alpha = 1 / math.sqrt(t+1) current_alpha = self.alpha self.q_function[current_state][action] = ( 1 - current_alpha) * current_q + current_alpha * ( reward + self.gamma * new_q) # print 'Updated Q = ' + str(self.q_function[current_state][action]) print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format( deadline, inputs, action, reward) # [debug]
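# ---------------------------------------------------------------------------
# Hedged worked example: the discount_deadline heuristic above sets, on the
# first step of each trip, gamma = 1 - 4/deadline, so longer deadlines push
# gamma toward 1 (weighting future rewards more heavily):
for deadline in (20, 40):
    print deadline, 1 - 4.0 / deadline  # prints 0.8, then 0.9
# ---------------------------------------------------------------------------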
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.T = 0.0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            self.T = self.T + 1.0
            # self.epsilon = self.epsilon - 0.05
            # self.epsilon = 1.0 / ((self.T * self.T) + (self.alpha * self.T))
            # self.epsilon = 1.0 / (self.T ** 2)
            self.epsilon = math.fabs(math.cos(self.alpha * self.T))
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a collection of relevant data for the agent
        state = {
            "light": inputs["light"],
            "oncoming": inputs["oncoming"],
            "left": inputs["left"],
            "direction": waypoint
        }
        return state

    def get_maxQ(self, hash_key):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = -1000.0
        for action in self.Q[hash_key]:
            if maxQ < self.Q[hash_key][action]:
                maxQ = self.Q[hash_key][action]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        state_key = self.hash_key(state)
        print "state: {}".format(state_key)
        if self.learning:
            self.Q[state_key] = self.Q.get(state_key, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })
            # print self.Q
        return

    def hash_key(self, state):
        """ Create the string key for the Q-table based on the state passed in.
            None values are mapped to the string 'None' so they can be concatenated. """
        light = state['light'] or 'None'
        oncoming = state['oncoming'] or 'None'
        left = state['left'] or 'None'
        direction = state['direction'] or 'None'
        return light + '_' + oncoming + '_' + left + '_' + direction

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
            print "Not learning... setting action: {}".format(action)
        else:
            if self.epsilon > 0.01 and self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                possible_actions = []
                hash_key = self.hash_key(state)
                maxQ = self.get_maxQ(hash_key)
                for a in self.Q[hash_key]:
                    if maxQ == self.Q[hash_key][a]:
                        possible_actions.append(a)
                action = random.choice(possible_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        hash_key = self.hash_key(state)
        if self.learning:
            self.Q[hash_key][action] = self.Q[hash_key][action] + \
                self.alpha * (reward - self.Q[hash_key][action])
            # self.Q[hash_key][action] = (1 - self.alpha) * self.Q[hash_key][action] + (reward * self.alpha)
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent state,
            choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn
        return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        self.state_def = [
            ['left', 'right', 'forward'],        # waypoint
            ['red', 'green'],                    # light
            ['left', 'right', 'forward', None],  # vehicle left
            ['left', 'right', 'forward', None],  # vehicle right
            ['left', 'right', 'forward', None]   # vehicle oncoming
        ]
        self.template_q = dict((k, 0.0) for k in self.valid_actions)
        for state_tuple in itertools.product(*self.state_def):
            self.Q[state_tuple] = self.template_q.copy()

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            # Use negative exponential e^(-at) decay function
            self.epsilon = math.exp(-self.alpha * self.t)
            self.t += 1
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent. Ensure it is in the
        # same order as in the class initializer
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])
        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state,
        # along with every action that attains it
        maxQ = max(self.Q[state].values())
        maxQ_actions = []
        for action, Q in self.Q[state].items():
            if Q == maxQ:
                maxQ_actions.append(action)
        return maxQ, maxQ_actions

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        if not self.learning:
            return
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        # Note: this is already performed in the class constructor, but replicated here too
        if state not in self.Q:
            self.Q[state] = self.template_q.copy()
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning or random.random() <= self.epsilon:
            action = random.choice(self.valid_actions)
        else:
            maxQ, maxQ_actions = self.get_maxQ(state)
            action = random.choice(maxQ_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = reward * self.alpha + \
                self.Q[state][action] * (1 - self.alpha)
        # alpha is deliberately kept constant, so it is not altered here
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent state,
            choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn
        return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0
        self.a = 0.005

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.epsilon = math.cos(self.a * self.t)
        # self.alpha = math.exp((-self.a) * self.t)
        # self.epsilon = 1 / (self.t ** 2)
        # self.epsilon -= 0.05
        self.t = self.t + 1

        if testing:
            self.epsilon = 0
            self.alpha = 0
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming'])

        # When learning, check if the state is in the Q-table
        # If it is not, create a dictionary in the Q-table for the current 'state'
        # For each action, set the Q-value for the state-action pair to 0
        if self.learning:
            try:
                self.Q[state]
            except KeyError:
                self.Q[state] = dict()
                for action in self.env.valid_actions:
                    self.Q[state][action] = 0
        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state.
        # Start from None and compare explicitly: relying on 'number > None'
        # only happens to work in Python 2
        maxQ = None
        for action in self.Q[state]:
            if maxQ is None or self.Q[state][action] > maxQ:
                maxQ = self.Q[state][action]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            try:
                self.Q[state]
            except KeyError:
                self.Q[state] = dict()
                for action in self.env.valid_actions:
                    self.Q[state][action] = 0.0
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.env.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.env.valid_actions)
            else:
                maxQ = self.get_maxQ(state)  # hoisted out of the loop below
                action_choice = []
                for a, value in self.Q[state].items():
                    if value == maxQ:
                        action_choice.append(a)
                action = random.choice(action_choice)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] += self.alpha * (reward - self.Q[state][action])
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent state,
            choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn
        return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=True, epsilon=0.5, alpha=0.5, gamma=0):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        self.gamma = gamma  # Discount factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Create a series of waypoints
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        # If it is not, create a dictionary in the Q-table for the current 'state'
        # For each action, set the Q-value for the state-action pair to 0
        state = None
        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = None
        return maxQ

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        return action

    def learn(self, state, action, reward, new_state):
        """ The learn function is called after the agent completes an action and
            receives a reward. 'new_state' is the state the smartcab arrives at once
            completing the action. This is for calculating future rewards. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use both the learning rate 'alpha' and the discount factor 'gamma'
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent state,
            choose an action, receive a reward, and learn if enabled. """

        # Update the agent based on the functions built above
        state = self.build_state()  # Build the agent pre-action state
        action = self.choose_action(state)  # Choose an action based on the agent state
        reward = self.env.act(self, action)  # Receive a reward based on the action
        new_state = self.build_state()  # Build the agent's post-action state
        self.learn(state, action, reward, new_state)  # Run the Q-Learning algorithm
        return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.epsilon - 0.05
            # make sure it does not go negative
            if self.epsilon <= 0:
                self.epsilon = 0
        return self.epsilon

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # NOTE: you are not allowed to engineer features outside of the inputs
        # available. Because the aim of this project is to teach Reinforcement
        # Learning, we have placed constraints in order for you to learn how to
        # adjust epsilon and alpha, and thus learn about the balance between
        # exploration and exploitation. With hand-engineered features, this
        # learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'])
        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        # (the value itself; ties between actions are resolved in choose_action)
        maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # valid_actions = [None, 'forward', 'left', 'right']

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning and state not in self.Q:
            self.Q[state] = {None: 0, 'forward': 0, 'left': 0, 'right': 0}
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with the highest Q-value you
        # randomly select between actions that "tie".
        # References:
        # https://docs.python.org/2/library/random.html
        # https://junedmunshi.wordpress.com/tag/epsilon-policy/
        if not self.learning:
            newmove = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                newmove = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                best_moves = [a for a, q in self.Q[state].items() if q == maxQ]
                newmove = random.choice(best_moves)
                print("New MAX move is", newmove)
        action = newmove
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            print("learning in state:", state)
            print("learning in action:", action)
            print("self.Q[state][action]", self.Q[state][action])
            currentQ = self.Q[state][action]
            self.Q[state][action] = reward * self.alpha + currentQ * (1 - self.alpha)
            print("NEW self.Q[state][action]", self.Q[state][action])
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent state,
            choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn
        print("Finished Learning....")
        return
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env_):
        super(LearningAgent, self).__init__(env_)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.state = {}
        self.actions = [None, 'forward', 'left', 'right']
        self.state_roots = ['light', 'next_waypoint', 'right', 'left', 'oncoming']
        # self.state_roots = ['light', 'next_waypoint']
        self.Qtable = {}
        # tuning variables
        self.gamma = 0.1  # discount factor
        self.alpha = 0.9  # learning rate
        self.epsilon = 10  # exploration rate (select a random action every x iterations)
        self.overall_simulations = 0
        self.overall_iterations = 0
        self.total_success = 0

    def reset(self, destination=None):
        self.overall_simulations += 1
        self.planner.route_to(destination)
        print "\n----- starting new simulation -----"
        self.print_Qtable()
        print "overall_iterations: ", self.overall_iterations
        print "overall_simulations: ", self.overall_simulations
        print "total_success: ", self.total_success
        print "self.gamma: ", self.gamma
        if self.total_success == 0:
            print "success_rate: 0%"
        else:
            success_rate = float(self.total_success) / float(self.overall_simulations)
            print "success_rate: ", success_rate * 100, "%"

    def add_new_state_to_Qtable(self, new_state_):
        # print "ADDING NEW STATE: ", new_state_
        action_state_init = 2
        init_actions = [(None, action_state_init),
                        ('forward', action_state_init),
                        ('left', action_state_init),
                        ('right', action_state_init)]
        self.Qtable.update({new_state_: init_actions})
        return init_actions

    def print_Qtable(self):
        print "self.Qtable:"
        for row in self.Qtable:
            print "\t", row, self.Qtable[row]

    def get_action_from_Qtable(self, state_):
        state_tuple = self.Qtable.get(str(state_))
        new_action_tuple = None
        if state_tuple is None:
            # if the state is not in the Q-table, create it and choose a random action
            state_actions = self.add_new_state_to_Qtable(state_)
            new_action_tuple = random.choice(state_actions)
            # print "new_random_action: ", new_action_tuple
        else:
            # the state is in the Q-table, so get the maximum
            # print "found_existing_Qtable_state: ", state_tuple
            # if all entries have the same Q-value, just return a random one
            Q_values = [entry[1] for entry in state_tuple]
            are_the_same = all(x == Q_values[0] for x in Q_values)
            if are_the_same:
                return random.choice(state_tuple)
            # epsilon-greedy check: explore on every self.epsilon-th time step
            if (self.env.t != 0) and ((self.env.t % self.epsilon) == 0):
                return random.choice(state_tuple)
            max_value = -sys.maxint - 1
            for index in range(len(state_tuple)):
                if state_tuple[index][1] > max_value:
                    max_state_index = index
                    max_value = state_tuple[index][1]
            new_action_tuple = state_tuple[max_state_index]
        return new_action_tuple

    def update(self, t):
        self.overall_iterations += 1
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        inputs.update({'next_waypoint': self.next_waypoint})
        inputs.update({'deadline': deadline})

        # Update state
        # print "\n\ninputs:", inputs
        # keep only the monitored state variables and save them to self.state
        for root in self.state_roots:
            self.state[root] = inputs[root]
        # self.print_Qtable()

        # 1) ----- get the new action from the Q-table (according to your policy)
        state_string = str(self.state)
        # print "state_string:", state_string
        new_action_tuple = self.get_action_from_Qtable(state_string)

        # 2) ----- Execute the action and get the reward
        action = new_action_tuple[0]
        # action = random.choice(self.actions)
        # action = self.next_waypoint
        # print "new_action_tuple:", new_action_tuple
        reward = self.env.act(self, action)
        # print "action REWARD: ", reward

        # 3) ----- Learn the policy based on state, action, reward
        new_state = {}
        new_inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs.update({'next_waypoint': self.next_waypoint})
        new_inputs.update({'deadline': self.env.get_deadline(self)})
        for root in self.state_roots:
            new_state[root] = new_inputs[root]
        # print "new_state:", new_state
        max_action_tuple = self.get_action_from_Qtable(str(new_state))
        Q_hat = (1 - self.alpha) * new_action_tuple[1] + \
            self.alpha * (reward + self.gamma * max_action_tuple[1])
        # print "Q_hat:", Q_hat

        # store the new/updated value in the Q-table
        state_tuples = self.Qtable[state_string]
        for i, s in enumerate(state_tuples):
            if s[0] == action:
                state_tuples[i] = (action, Q_hat)
        self.Qtable[state_string] = state_tuples

        if self.env.trial_data['success'] == 1:
            self.total_success += 1
            self.epsilon += 1  # explore less often after each success

        # for every 10 successes, grow the discount factor gamma (capped at 1),
        # so that future rewards weigh more as the policy improves
        if (self.total_success > 1 and (self.total_success % 10) == 0
                and self.gamma < 1):
            self.gamma += 0.1
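A final note on this last agent: it keys the Q-table by str(state_dict), which depends on dictionary iteration order. Within a single CPython 2 run with a fixed key set this is stable, but a sorted tuple of items is a safer canonical key across runs and implementations. A possible helper, purely illustrative:

def state_key(state_dict):
    """Canonical, order-independent key for a state dictionary."""
    return tuple(sorted(state_dict.items()))

# str({'a': 1, 'b': 2}) can vary with dict ordering between implementations;
# state_key gives the same tuple either way:
print state_key({'light': 'green', 'next_waypoint': 'left'})
# (('light', 'green'), ('next_waypoint', 'left'))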