Exemplo n.º 1
0
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.possible_actions= Environment.valid_actions
        self.ai = QLearn(epsilon=.05, alpha=0.1, gamma=0.9)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        inputs = inputs.items()
        deadline = self.env.get_deadline(self)

        self.state = (inputs[0],inputs[1],inputs[3],self.next_waypoint)
        
        action = self.ai.Q_move(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        inputs2 = self.env.sense(self) 
        inputs2 = inputs2.items()
        next_state = (inputs2[0],inputs2[1],inputs2[3],self.next_waypoint)

        self.ai.Q_post(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 2
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        
        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]
Exemplo n.º 3
0
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint
        self.learner = QLearn(epsilon=1, alpha=0.3, gamma=0.6)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.learner.action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_inputs = self.env.sense(self) 
        self.future_next_way_out = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.future_next_way_out)
        #next_state = (next_inputs[0], next_inputs[1], next_inputs[3], self.planner.next_waypoint())
        self.learner.learn(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 4
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_matrix = {}
        self.alpha = 0.95
        self.gamma = 0.05
        self.explore = 0.05
        self.state = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        deadline = self.env.get_deadline(self)

        state = self.observe_state()
        self.state = state
        
        knowledge = self.knowledge_from_state(state)

        # TODO: Select action according to your policy
        if random() < self.explore:
            action = choice(OPTIONS)
        else:
            action = sorted(knowledge.items(), key=lambda x: x[1], reverse=True)[0][0]

        old_value = knowledge[action]

        # Execute action and get reward
        reward = self.env.act(self, action)
        new_state = self.observe_state()
        new_state_knowledge = self.knowledge_from_state(new_state)

        self.q_matrix[state][action] = (1-self.alpha) * old_value +\
                                       self.alpha * (reward + self.gamma * max(new_state_knowledge.values()))
        

        #print "LearningAgent.update(): deadline = {}, state = {}, action = {}, reward = {}".format(deadline, state, action, reward)  # [debug]

    def knowledge_from_state(self, state):
        """
        Gets what we know about the given state
        """
        if not state in self.q_matrix:
            self.q_matrix[state] = {option: 0 for option in OPTIONS}
        return self.q_matrix[state]

    def observe_state(self):
        """
        Observe the current state
        """
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        return (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
Exemplo n.º 5
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # location, heading, delta, deadline, oncoming, traffic light state
        inputs['waypoint'] = self.next_waypoint

        self.state = tuple(sorted(inputs.items())) # performs the actual update

        # TODO: Select action according to your policy
        actions = [None,'forward','left','right']
        action = actions[random.randint(0,3)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class BasicAgent(Agent):
    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.color = 'red'  # override color

    def reset(self, destination=None):
        self.planner.route_to(destination)  
        
    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        action = random.choice(self.env.valid_actions)
        reward = self.env.act(self, action)
        
        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        right = inputs['right']
       
        #Update the current observed state
        self.state = (light, oncoming, left, right, self.next_waypoint)
        
        print "Basic.update(): next move = {}".format(action)  # [debug]
Exemplo n.º 7
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = 0.2
        self.Qtable = {}
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.reward = 0
        
        
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # Inform Driving Agent
        self.state = (inputs['light'], inputs['oncoming'], 
                      inputs['left'], inputs['right'], self.next_waypoint)
        
        
        # TODO: Select action according to your policy
        valid_actions = [None, 'forward', 'left', 'right']
        
        # Random Pick Actions
        ### action = random.choice(valid_actions)
        
        # Get all Q values for all state
        all_actions = {action: self.Qtable.get((self.state, action), 0)
                        for action in valid_actions}
                           
        # Find action that maximizes Q values
        best_actions = [action for action in valid_actions 
                        if all_actions[action] == max(all_actions.values())]
           
        # Return Best Action
        action = random.choice(best_actions)    
         
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        # To simplify the problem, I set gamma equals 0.
        self.Qtable[(self.state, action)] = \
        (1 - self.alpha) * self.Qtable.get((self.state, action), 0) + self.alpha * reward
     
           
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 8
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here



    def reset(self, destination=None):
        self.planner.route_to(destination)



    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = inputs['location']
        self.heading=inputs['heading']

        action = random.choice(Environment.valid_actions)
        reward = self.env.act(self, action)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]
Exemplo n.º 9
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        #Actions
        self.A = ['forward', 'left', 'right', None] #all the actions available
        self.trial = 0

        #Initialize Q table(light, oncoming, next_waypoint)
        self.Q={}
        for i in ['green', 'red']:  #possible lights
            for j in [None, 'forward', 'left', 'right']: #possible oncoming_agent
                for k in ['forward', 'left', 'right']: #possible next_waypoints
                    self.Q[(i,j,k)] = [1] * len(self.A)


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)
        #Find the max Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        #assign action
        p = random.randrange(0,5)
        epsilon = 1.5 # small probability to act randomly
        if p<epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        gamma = 0.30
        alp_tune =500 # tunning parameter
        alpha = 1/(1.1+self.trial/alp_tune) # decay learning rate
        self.trial = self.trial+1

        #get the next action state Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_next_waypoint)

        #Update Q Table
        self.Q[self.state][self.A.index(action)] = \
        (1-alpha)*self.Q[self.state][self.A.index(action)] + \
        (alpha * (reward + gamma * max(self.Q[next_state])))
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.6, discount_factor=0.35):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.state = None
        self.Q_values = self.initialize_Q_values()
 
    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
       
    def initialize_Q_values(self, val=10.0):
        Q_values = {}
        for waypoint in ['left', 'right', 'forward']:
            for light in ['red', 'green']:
                for action in self.env.valid_actions:
                    Q_values[((waypoint, light), action)] = val
        return Q_values

    def find_max_Q(self, state):
        action = None
        max_Q = 0.0
        for a in self.env.valid_actions:
            Q = self.Q_values[(state, a)]
            if Q > max_Q:
                action = a
                max_Q = Q
        return (max_Q, action)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'])

        # TODO: Select action according to your policy
        #action = random.choice(self.env.valid_actions) 
        (Q, action) = self.find_max_Q(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        state_prime = (next_waypoint, inputs['light'])
        (Q_prime, action_prime) = self.find_max_Q(state_prime)        
        Q += self.alpha * (reward + self.gamma*Q_prime - Q) 
        self.Q_values[(self.state, action)] = Q

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 11
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        # # from route planner, also displayed by simulator
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = self.next_waypoint, inputs['light']
        # self.state = self.next_waypoint, (inputs['oncoming'] == None or inputs['light'] == 'green')

        # TODO: Select action according to your policy
        # Define actions
        actions = [None, 'forward', 'left', 'right']

        # QUESTION 1- select random action
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
class LearningAgent(Agent):
    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.next_waypoint = None
        self.total_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        loc, hed = self.get_my_location()
        action = self.get_next_waypoint_given_location( loc, hed)
        action_okay = self.check_if_action_is_ok(inputs)
        if not action_okay:
            action = None
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        #TODO: Learn policy based on state, action, reward
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        valid_waypoints = ['forward','right','left']
        valid_lights = ['red','green']
        valid_actions = ['forward','right','left', None]

        self.Q_hat={}
        states = [(next_waypoint,light,oncoming,right,left) for next_waypoint in valid_waypoints for light in valid_lights for oncoming in valid_actions for right in valid_actions for left in valid_actions]
        states.append('Destination')
        for state in states:
            self.Q_hat[state] = {}
            for action in valid_actions:
                self.Q_hat[state][action] = random.random()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if self.next_waypoint == None:
            self.state = 'Destination'
        else:
            self.state = (self.next_waypoint, inputs['light'],inputs['oncoming'],inputs['right'],inputs['left'])

        # TODO: Select action according to your policy

        valid_actions = ['forward','right','left', None]
        epsilon = 0.1 #0.01

        best_action = max(self.Q_hat[self.state],key=self.Q_hat[self.state].get)
        random_action = choice(valid_actions)
        action = choice([best_action, random_action],p=[1-epsilon,epsilon])

        # Execute action and get reward

        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward

        new_next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs = self.env.sense(self)
        new_state = (new_next_waypoint, new_inputs['light'],new_inputs['oncoming'],new_inputs['right'],new_inputs['left'])
        alpha = 0.5 #opt 0.7
        gamma = 0.5 #opt 0.1
        max_Qhat_ahat = max(self.Q_hat[new_state].values())
        self.Q_hat[self.state][action] = (1-alpha)*self.Q_hat[self.state][action]+alpha*(reward+gamma*max_Qhat_ahat)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 14
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.prevReward = 0
        self.prevAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.prevReward = 0
        self.prevAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = None
        validActions = []
        
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                validActions = ['right', None]
        else:
            if inputs['oncoming'] == 'forward':
                validActions = [ 'forward','right']
            else:
                validActions = ['right','forward', 'left']

        # TODO: Select action according to your policy
        if validActions != []:
            action = random.choice(validActions)
        else:
            action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if action != None:
            if reward > self.prevReward:
                self.state = (inputs,self.next_waypoint,deadline) 
                self.prevAction = action
                self.prevReward = reward
            else:
                self.state = (inputs,self.next_waypoint,deadline)
                self.prevAction = action
                self.prevReward = reward


        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 15
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qtable = defaultdict(dict)
        self.epsilon = .4  # Randomness factor
        self.alpha = .4  # Learning rate.

    def reset(self, destination=None):
        # Prepare for a new trip; reset any variables here, if required
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.define_state(inputs, self.next_waypoint)

        # Select action according policy
        action = self.select_action()

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_qtable(action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]

    def select_action(self):
        if random.random() < self.epsilon or self.state not in self.qtable:
            action = random.choice(self.env.valid_actions)
        else:
            action = max(self.qtable[self.state], key=self.qtable[self.state].get)

        if self.state not in self.qtable:
            for a in self.env.valid_actions:  # init action:reward dict.
                self.qtable[self.state][a] = 0
        return action

    def update_qtable(self, action, reward):
        old_q = self.qtable[self.state].get(action, 0)
        self.qtable[self.state][action] = ((1 - self.alpha) * old_q) + (self.alpha * reward)
        # Decay alpha and epsilon until it reaches 0.1 and 0.0, respectively.
        self.alpha = max(0.001, self.alpha - 0.002)
        self.epsilon = max(0.0, self.epsilon - 0.002)

    @staticmethod
    def define_state(inputs, next_waypoint):
        return tuple(inputs.values()) + (next_waypoint, )
Exemplo n.º 16
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Qtable = {}
        self.alpha = 0.8
        self.gamma = 0.6

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        old_state = self.state
        
        # TODO: Select action according to your policy
        if (old_state, None) not in self.Qtable.keys():
            self.Qtable[(old_state, None)] = 0
        if (old_state, 'left') not in self.Qtable.keys():
            self.Qtable[(old_state, 'left')] = 0
        if (old_state, 'forward') not in self.Qtable.keys():
            self.Qtable[(old_state, 'forward')] = 0
        if (old_state, 'right') not in self.Qtable.keys():
            self.Qtable[(old_state, 'right')] = 0
        
        max_action = max(self.Qtable[(old_state, None)], self.Qtable[(old_state, 'left')],self.Qtable[(old_state, 'forward')],self.Qtable[(old_state, 'right')])
        action = random.choice([ i for i in ['left', 'forward', 'right', None] if self.Qtable[(old_state, i)] == max_action])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        #self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        new_state = self.state

        if (new_state, None) not in self.Qtable.keys():
            self.Qtable[(new_state, None)] = 0
        if (new_state, 'left') not in self.Qtable.keys():
            self.Qtable[(new_state, 'left')] = 0
        if (new_state, 'forward') not in self.Qtable.keys():
            self.Qtable[(new_state, 'forward')] = 0
        if (new_state, 'right') not in self.Qtable.keys():
            self.Qtable[(new_state, 'right')] = 0

        self.Qtable[(old_state, action)] = (1 - self.alpha) * self.Qtable[(old_state, action)] + self.alpha *(reward + self.gamma * max(self.Qtable[(new_state, i)] for i in [None, 'left', 'forward', 'right']))
Exemplo n.º 17
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = [None, 'forward', 'left', 'right']
        self.learning_rate = 0.3
        self.state = None
        self.q = {}
        self.trips = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.trips += 1
        if self.trips >= 100:
            self.learning_rate = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],
self.next_waypoint)
        self.state = state

        # Lazy initialization of Q-values
        for action in self.actions:
            if (state, action) not in self.q:
                self.q[(state, action)] = 1
        
        # Select action according to the policy

        probabilities = [self.q[(state, None)], self.q[(state, 'forward')], self.q[(state, 'left')], self.q[(state, 'right')]]
        # Use the softmax funtion so that the values actually behave like probabilities
        probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=0)

        adventurousness = max((100 - self.trips) / 100, 0)
        if random.random() < adventurousness:
            action = np.random.choice(self.actions, p=probabilities)
        else:
            action = self.actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q[(state, action)] = self.learning_rate * reward + (1 - self.learning_rate) * self.q[(state, action)]

        #print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def get_state(self):
        return self.state
Exemplo n.º 18
0
class BasicAgent(Agent):
    """A basic agent upon which to build learning agents."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qvals = {} # mapping (state, action) to q-values
        self.time = 0 # number of moves performed
        self.possible_actions = (None, 'forward', 'left', 'right')
        self.n_dest_reached = 0 # number of destinations reached
        self.last_dest_fail = 0 # last time agent failed to reach destination
        self.sum_time_left = 0 # sum of time left upon reaching destination over all trials
        self.n_penalties = 0 # number of penalties incurred
        self.last_penalty = 0 # last trial in which the agent incurred in a penalty

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def best_action(self, state):
        """
        Return a random action (other agents will have different policies)
        """
        return random.choice(self.possible_actions)

    def update_qvals(self, state, action, reward):
        """
        Keeps track of visited (state, action) pairs. 
        (in other agents will use reward to update 
        the mapping from (state, action) pairs to q-values)
        """
        self.qvals[(state, action)] = 0
    
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # update time
        self.time += 1

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'],
                      self.next_waypoint)

        # Pick an action
        action = self.best_action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.n_penalties += 1
        
        # Update the q-value of the (state, action) pair
        self.update_qvals(self.state, action, reward)
        
        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 19
0
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        self.time_step = 1.0
    
    def get_decay_rate(self, t): #Decay rate for epsilon
       return 1.0 / float(t)
       
    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.time_step = 1.0
        # TODO: Prepare for a new trip; reset any variables here, if required
        '''
        self.previous_action = None
        self.next_state = None
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        '''
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        #env_states = self.env.agent_states[self]
        self.time_step += 1
        epsilon = self.get_decay_rate(self.time_step)
        self.QLearner.set_eps(epsilon)
        
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        #self.state = (inputs['light'], self.next_waypoint)

        action = self.QLearner.q_move(self.state)        
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        
        new_inputs = self.env.sense(self)
        #env_states = self.env.agent_states[self]
        new_deadline = self.env.get_deadline(self)
        self.next_state =  (new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], self.next_waypoint)
        #self.next_state =  (new_inputs['light'], self.next_waypoint)
        
        # TODO: Learn policy based on state, action, reward

        self.QLearner.q_future_reward(self.state, action, self.next_state, reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 20
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha, q_initial):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.policy = {}
        self.trip_log = []
        self.trip = None
        self.alpha = alpha
        self.q_initial = q_initial

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        if self.trip:
            self.trip_log.append(self.trip)
        self.trip = {}
        self.trip['Deadline'] = self.env.get_deadline(self)
        self.trip['Reward'] = 0
        self.trip['Penalty'] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        inputs.pop('right')
        self.state = inputs
        self.state['waypoint'] = self.next_waypoint

        # TODO: Select action according to your policy
        state_hash = hashabledict(frozenset(self.state.iteritems()))
        q = self.policy.get(state_hash, {
            None: self.q_initial,
            'forward': self.q_initial,
            'left': self.q_initial,
            'right': self.q_initial,
        })
        action = max(q.iteritems(), key=operator.itemgetter(1))[0]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Update trip stats
        self.trip['Reward'] += reward
        self.trip['Remaining'] = self.env.get_deadline(self)
        self.trip['Success'] = self.planner.next_waypoint() == None
        if reward < 0: self.trip['Penalty'] += reward

        # TODO: Learn policy based on state, action, reward
        q[action] = (1 - self.alpha) * q[action] + self.alpha * reward
        self.policy[state_hash] = q
Exemplo n.º 21
0
class BasicLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(BasicLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0
        self.deadline = self.env.get_deadline(self)
        self.actions = ['forward', 'left', 'right', None]
        self.reached_destination = 0
        self.penalties = 0
        self.movements = 0

    def tuple_state(self, state):
        State = namedtuple("State", ["light", "next_waypoint"])
        return State(light = state['light'], next_waypoint = self.planner.next_waypoint()) 

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # TODO: Update state
        self.state = self.tuple_state(inputs)

        # TODO: Select action according to your policy
        action = random.choice(self.actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        if reward >= 10: self.reached_destination += 1
        if reward < 0: self.penalties += 1
        self.movements += 1

        # TODO: Learn policy based on state, action, reward 
        self.last_action = action
        self.last_state = self.state
        self.last_reward = reward
        self.total_reward += reward
Exemplo n.º 22
0
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.learning_rate = 0.9
        self.states = None
        self.possible_actions = [None, 'forward', 'left', 'right']
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)



        # TODO: Update state
        # state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],deadline)
        state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])
        # state = (self.next_waypoint,inputs['light'])
        self.state = state

        for action in self.possible_actions:
            if(state, action) not in self.Q: 
                self.Q[(state, action)] = 20

        # TODO: Select action according to your policy
        Q_best_value = [self.Q[(state, None)], self.Q[(state, 'forward')], self.Q[(state, 'left')], self.Q[(state, 'right')]]
        # Softmax
        Q_best_value = np.exp(Q_best_value) / np.sum(np.exp(Q_best_value), axis=0)
        
        #action = np.random.choice(self.possible_actions, p=probabilities)
        action = self.possible_actions[np.argmax(Q_best_value)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward == -10.0 :
            print "Invalid move"
        if reward == -0.5 :
            print "valid, but is it correct?"
        

        # TODO: Learn policy based on state, action, reward
        # self.Q[(state,action)] = reward + self.learning_rate * self.Q[(state,action)]
        self.Q[(state,action)] = reward * self.learning_rate + ( 1 - self.learning_rate ) * self.Q[(state,action)]
Exemplo n.º 23
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        
        # TODO: Initialize any additional variables here
        self.acts = ['left', 'right', 'forward', 'stay']
        df_tmp = pd.DataFrame(0*np.ones((2, 4)), index=['red', 'green'], columns=self.acts)
        self.Q = df_tmp
        self.deadline = 0
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        self.deadline = deadline

        # TODO: Update state
        self.state = inputs['light'] # in this case the state is simply defined by the color oof the traffic light at the intersection
        
        # TODO: Select action according to your policy
        # Q-Learner
        act_opt = self.Q.ix[self.state].argmax()
        if act_opt=='stay': act_opt=None
#            print "Current state = %s, qlearner action = %s" % (self.state, act_opt)
        action = act_opt

        # Execute action and get reward
        reward = self.env.act(self, action)


        # TODO: Learn policy based on state, action, reward

        # Maximize Q conditional on the state we are in = S
        q_S = self.Q.ix[self.state] # Q conditional on state S
        q_S_mx = q_S[q_S==q_S.max()] # just in case there are multiple values and the action that maximizes Q(S, a) is not unique
        ix_mx = np.random.randint(0,len(q_S_mx)) # pick one of the equally qlearner actions
        q_S_mx = q_S_mx.iloc[ix_mx]

        # Update Q
        act_tmp = action if action !=None else 'stay'
        global alpha, alpha0, gamma
        alpha *= alpha0
        self.Q.ix[self.state, act_tmp] = \
            (1-alpha)*self.Q.ix[self.state, act_tmp] + \
            alpha*(reward + gamma*q_S_mx) # note: we update the action that has been actually taken, but we update the contribution to Q(S, a) by the action that could have been taken had we behaved qlearnerly
Exemplo n.º 24
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.QLearning = QLearning(epsilon=0.3, alpha=0.95, gamma=0.9)
        # self.QLearning = pickle.load(open('./QLearning.pkl'))

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def is_action_okay(self, inputs):
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False

        return action_okay

    def update(self, t):
        # Gather states
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        action_okay = self.is_action_okay(inputs)
        self.state = (action_okay, self.next_waypoint, deadline < 3)
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.QLearning.policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Gather next states
        next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        next_inputs = self.env.sense(self)
        next_deadline = self.env.get_deadline(self)
        next_action_okay = self.is_action_okay(next_inputs)
        next_state = (next_action_okay, next_waypoint, next_deadline < 3)
        # next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_waypoint)

        # TODO: Learn policy based on state, action, reward
        self.QLearning.update_QTable(self.state, action, reward, next_state)
Exemplo n.º 25
0
class RandomAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(RandomAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.availableAction = ['forward', 'left', 'right',None]   
        self.goal=0
        self.steps=0
        self.features=[]
        self.deadline=[]
        self.total_reward=[0]

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        #print"RESET, Final state:\n", self.state
        try:
            if self.deadline[len(self.features)-1] >0: #deadline less than zero
                self.goal+=1 #FIXME - order
                print "PASS! {} steps to goal,Goal reached {} times out of {}!".format(
                                                        self.deadline[len(self.features)-1],self.goal,len(self.features))
            else:
                print "FAIL! {} steps to goal,Goal reached {} times out of {}!".format(
                                                        self.deadline[len(self.features)-1],self.goal,len(self.features))
                pass
        except:
            print "Trial 0 - Goal reached {} times out of {}!".format(self.goal,len(self.features))
            pass
        print "----------------------------------------------------------"
        self.features.append({})
        self.deadline.append(None)
        self.total_reward.append(0)
        self.steps=0

    def update(self, t):
        # Gather inputs
        self.steps+=1
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        #self.deadline[len(self.features)] = self.env.get_deadline(self)
        self.state=inputs
        self.features[len(self.features)-1][self.steps]=inputs
        self.deadline[len(self.deadline)-1] = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        action = self.availableAction[random.randint(0,3)]    
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.lastReward=reward
        # TODO: Learn policy based on state, action, reward
        self.total_reward[len(self.total_reward)-1] =self.total_reward[len(self.total_reward)-1]+reward
Exemplo n.º 26
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.78, discount_factor=0.36):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.actions = Environment.valid_actions
        self.oldAction = None


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.oldAction=None
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        #------------------------------------------------
        #action = np.random.choice(self.actions)
        #------------------------------------------------
        


        # TODO: Select action according to your policy
        current_state = tuple(inputs.values() + [self.next_waypoint])
        self.state = (inputs, self.next_waypoint)
        
        if current_state not in self.Q:
        	self.Q[current_state] = [3,3,3,3]

        Q_max = self.Q[current_state].index(np.max(self.Q[current_state]))
        # Execute action and get reward
        action = self.actions[Q_max]
        reward = self.env.act(self, action)



        # TODO: Learn policy based on state, action, reward
        
        if t!=0:
			oldvalue = self.Q[self.oldAction[0]][self.oldAction[1]]
			newvalue = ((1 - self.learning_rate)*oldvalue)+(self.learning_rate * (self.oldAction[2] + self.discount_factor * self.Q[current_state][Q_max]))
			self.Q[self.oldAction[0]][self.oldAction[1]] = newvalue 
        
        self.oldAction = (current_state, self.actions.index(action), reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 27
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.debugOn = False
        self.reporter = Reporter(driver_type, self.env.valid_actions)
        if driver_type=='random':
            self.driver = RandomDriver(self.env.valid_actions)
        elif driver_type=='greedy':
            self.driver = GreedyDriver(self.env.valid_actions)
        elif driver_type=='greedy2':
            self.driver = Greedy2Driver(self.env.valid_actions)
        elif driver_type=='smart':
            self.driver = SmartDriver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart2':
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart3':
            self.driver = Smart3Driver(self.env.valid_actions, self.reporter)
        else:
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.reporter.reset(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = State.make(self.next_waypoint, inputs)
        
        # TODO: Select action according to your policy
        action = self.driver.decide(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        newWaypoint = self.planner.next_waypoint()
        newState = State.make(newWaypoint, self.env.sense(self))
        self.driver.learn(t, deadline, self.state, action, reward, newState)

        location = self.env.agent_states[self]['location']
        if self.debugOn:
            print "LearningAgent.update(): deadline={} inputs={} action={} reward={} location={}".format(deadline, inputs, action, reward, location)  # [debug]
        self.reporter.update(t, deadline, self.state, action, reward, location, self.driver)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, trials):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        # TODO: Initialize any additional variables here
        self.trials = trials
        self.success = 0  # How many times agent able to reach the target for given trials?

        self.penalties = 0
        self.total_rewards = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)

        # TODO: Prepare for a new trip; reset any variables here, if required
        self.penalties = 0
        self.total_rewards = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # self.state = inputs
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'])
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = random.choice(self.env.valid_actions)  # Initial random movement / random action

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".\
            format(deadline, inputs, action, reward)  # [debug]

        # Statistics
        self.total_rewards += reward

        if reward < 0:
            self.penalties += 1

        if reward > 5:
            self.success += 1
            print "\nTrial info: Number of Penalties =  {}, Net rewards: {}. \nTotal success/trials: {}/{}. \n".\
                format(self.penalties, self.total_rewards, self.success, self.trials)
Exemplo n.º 29
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here        
        self.actions = ['forward', 'left', 'right',None]        
        self.q_table = Counter()        
        self.alpha = 0.5
        self.gamma = 0.2


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def get_best_action(self, state):
        best_profit = -999
        best_action = None
        for a in self.actions:
            profit = self.q_table[(state, a)]            
            if profit > best_profit:
                best_profit = profit
                best_action = a
        return best_action, best_profit


    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state        
        self.state = (inputs['light'],inputs['oncoming'], inputs['left'], self.next_waypoint)       

        if randint(0,10) < 3:
            action = self.actions[randint(0, 3)]
        else:        
            action, _ = self.get_best_action(self.state)
        
        reward = self.env.act(self, action)

        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_inputs['left'], next_next_waypoint)      
       
        _, best_profit = self.get_best_action(next_state)
        self.q_table[(self.state, action)] = (1-self.alpha)*self.q_table[(self.state, action)] + (self.alpha * (reward + self.gamma * best_profit))

       
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 30
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0

        #Q matrix
        self.Q = {}

        # initial Q values
        self.initial_q = 30

        # Initialize state
        self.states = None

        # Dictionary of possible actions to index states from Q matrix
        self.actions = {None : 0, 'forward' : 1, 'left' : 2, 'right' : 3}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Check if this state is included in the Q matrix. If not, create a new row in the matrix.
        if self.state not in self.Q:
            self.Q[self.state] = [self.initial_q for i  in range(len(self.env.valid_actions))]

        # TODO: Select action according to your policy
        learning_rate = 0.8
        action = self.env.valid_actions[np.argmax(self.Q[self.state])]
        discount = 0
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        # TODO: Learn policy based on state, action, reward

        # get the future state
        future_state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Update the Q matrix (add consideration of future reward)
        #self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] * (1 - learning_rate) + reward * learning_rate
        self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] + learning_rate * (reward + discount * self.Q[future_state][self.actions[action]] - self.Q[self.state][self.actions[action]])
Exemplo n.º 31
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        # Set any additional class parameters as needed
        self.trial = 1

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
           'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0

        else:
            #self.epsilon -= 0.05  ##To be used in default learner
            self.epsilon = -math.pow(-0.03 * self.trial, 2) + 1
            self.alpha = -math.pow(-0.02 * self.trial, 2) + 1

        if self.learning:
            self.trial += 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (inputs['light'], inputs['oncoming'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if not state in self.Q.keys():
                state_dict = {}
                for action in self.valid_actions:
                    state_dict[action] = 0.0
                self.Q[state] = state_dict

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability

        if (self.learning and
                random.random() < self.epsilon) or state not in self.Q.keys():
            action = random.choice(self.valid_actions)

        else:
            Q_val = self.get_maxQ(state)
            for a in self.valid_actions:
                if self.Q[state][a] == Q_val:
                    action = a
                    break

        #   Otherwise, choose an action with the highest Q-value for the current state

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            current_Q = self.Q[state][action]
            self.Q[state][action] = (
                1 - self.alpha) * current_Q + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Exemplo n.º 32
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_values = {}
        self.penalties = []
        self.penalty = 0
    def reset(self, destination=None):
        self.penalties.append(self.penalty)
        self.penalty = 0
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        curr_state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        self.state = curr_state

        old_q, action = self.choose_best_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # calculate penalties
        if reward < 0:
            self.penalty += 1

        # Sense the environment, and obtain new state
        new_inputs = self.env.sense(self)
        new_state = (self.next_waypoint, new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], new_inputs['right'])

        # Update the Q table
        new_q_val = self.calculate_q_val(new_state, old_q, reward)

        self.q_values[(curr_state, action)] = new_q_val

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, waypoint = {}".format(deadline, inputs, action, reward, self.next_waypoint)  # [debug]

    def get_max_q(self, new_state):
        q = [self.get_q_value(new_state, action) for action in Environment.valid_actions]
        max_q = max(q)
        return max_q, q

    def choose_best_action(self, state):
        max_q, q_lst = self.get_max_q(state)

        # If there are multiple max_q, pick a random one
        if q_lst.count(max_q) > 1:
            max_q_lst = [i for i in range(len(Environment.valid_actions)) if q_lst[i] == max_q]
            idx = random.choice(max_q_lst)
        else:
            idx = q_lst.index(max_q)

        action = Environment.valid_actions[idx]

        return max_q, action

    def get_q_value(self, state, action):
        return self.q_values.get((state, action), 1.0)

    def calculate_q_val(self, new_state, old_q, reward):
        # learning rate
        alpha = 0.5
        # discount factor
        gamma = 0.25
        # learned value
        learned_val = reward + (gamma * self.get_max_q(new_state)[0] - old_q)

        new_q = old_q + alpha * learned_val

        return new_q
Exemplo n.º 33
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        self.state = {}
        self.learning_rate = 0.6
        self.exploration_rate = 0.1
        self.exploration_degradation_rate = 0.001
        self.discount_rate = 0.4
        self.q_values = {}
        self.valid_actions = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        self.state = self.build_state(inputs)

        # TODO: Select action according to your policy
        action = self.choose_action_from_policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        self.update_q_value(self.state, action, reward)

        #print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

    def build_state(self, inputs):
        return {
            "light": inputs["light"],
            "oncoming": inputs["oncoming"],
            "left": inputs["left"],
            "direction": self.next_waypoint
        }

    def choose_action_from_policy(self, state):
        if random.random() < self.exploration_rate:
            self.exploration_rate -= self.exploration_degradation_rate
            return random.choice(self.valid_actions)
        best_action = self.valid_actions[0]
        best_value = 0
        for action in self.valid_actions:
            cur_value = self.q_value_for(state, action)
            if cur_value > best_value:
                best_action = action
                best_value = cur_value
            elif cur_value == best_value:
                best_action = random.choice([best_action, action])
        return best_action

    def max_q_value(self, state):
        max_value = None
        for action in self.valid_actions:
            cur_value = self.q_value_for(state, action)
            if max_value is None or cur_value > max_value:
                max_value = cur_value
        return max_value

    def q_value_for(self, state, action):
        q_key = self.q_key_for(state, action)
        if q_key in self.q_values:
            return self.q_values[q_key]
        return 0

    def update_q_value(self, state, action, reward):
        q_key = self.q_key_for(state, action)
        cur_value = self.q_value_for(state, action)
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()
        new_state = self.build_state(inputs)
        learned_value = reward + (self.discount_rate *
                                  self.max_q_value(new_state))
        new_q_value = cur_value + (self.learning_rate *
                                   (learned_value - cur_value))
        self.q_values[q_key] = new_q_value

    def q_key_for(self, state, action):
        return "{}|{}|{}|{}|{}".format(state["light"], state["direction"],
                                       state["oncoming"], state["left"],
                                       action)
Exemplo n.º 34
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=0.99, alpha=0.8):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        # self.epsilon=self.epsilon*0.99
        self.epsilon = self.epsilon * 0.99
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        dic = self.Q[state]
        maxQ = -100000.0
        for key in dic:
            if dic[key] > maxQ:
                maxQ = dic[key]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        dic = self.Q
        if dic.has_key(state) == False:
            dic[state] = {
                self.valid_actions[0]: 0.0,
                self.valid_actions[1]: 0.0,
                self.valid_actions[2]: 0.0,
                self.valid_actions[3]: 0.0
            }
        self.Q = dic

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if self.learning == False:

            i = random.randint(0, 3)
            action = self.valid_actions[i]
        elif self.learning == True:
            j = random.random()
            if j < self.epsilon:
                i = random.randint(0, 3)
                action = self.valid_actions[i]
            else:
                dic = self.Q[state]
                action = None
                l1 = list()
                for key in dic:
                    if dic[key] == self.get_maxQ(state):
                        l1.append(key)
                m = len(l1)
                action = l1[random.randint(0, m - 1)]
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning == True:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Exemplo n.º 35
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """ 

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the evironment 
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        
        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing == True:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.epsilon - 0.002
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint 
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        
        # NOTE : you are not allowed to engineer eatures outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed 
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.
        
        # Set 'state' as a tuple of relevant data for the agent        
        state = (waypoint,inputs['light'],inputs['oncoming'],inputs['left'],inputs['right'])

        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        stateactions = self.Q[state]
        highq = max(stateactions.iterkeys(), key=(lambda key: stateactions[key]))
        maxQ = stateactions[highq]
        return maxQ 


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning == True:
            if state not in self.Q:
                actionset = {'left': 0.0, 'right': 0.0, 'forward': 0.0, None : 0.0}
                self.Q[state] = actionset
                    
        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
                    
        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        
        if self.learning == False:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                maxq = self.get_maxQ(state)
                maxqactions = []
                for action, qvalue in self.Q[state].iteritems():
                    if maxq == self.Q[state][action]:
                        maxqactions.append(action)
                action = random.choice(maxqactions)
        return action


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning == True:  
            self.Q[state][action] = self.Q[state][action]*(1.0-self.alpha)+self.alpha*reward

        return


    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """ 

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the evironment 
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        random.seed(1177)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        
        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # commented out testing parameters
            #self.epsilon = self.epsilon - 0.05
            self.t += 1.0
            # self.epsilon = 1.0/(self.t**2)
            # self.epsilon = 1.0/(self.t**2 + self.alpha*self.t)
            # self.epsilon = 1.0/(self.t**2 - self.alpha*self.t)
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))/(self.t**2)
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            self.epsilon = math.exp(-self.alpha*self.t)
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint 
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent        
        def xstr(s):
            if s is None:
                return 'None'
            else:
                return str(s)
        
        state = xstr(waypoint) + "_" + inputs['light'] + "_" + xstr(inputs['left']) + "_" +  xstr(inputs['oncoming'])
        if self.learning:
            self.Q[state] = self.Q.get(state, {None:0.0, 'forward':0.0, 'left':0.0, 'right':0.0})
        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = -1000.0
        for action in self.Q[state]:
            if maxQ < self.Q[state][action]:
                maxQ = self.Q[state][action]
        return maxQ


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            self.Q[state] = self.Q.get(state, {None:0.0, 'forward':0.0, 'left':0.0, 'right':0.0})

        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > 0.01 and self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                valid_actions = []
                maxQ = self.get_maxQ(state)
                for act in self.Q[state]:
                    if maxQ == self.Q[state][act]:
                        valid_actions.append(act)
                action = random.choice(valid_actions)
        return action


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] + self.alpha*(reward-self.Q[state][action])

        return


    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
Exemplo n.º 37
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.times = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.times = self.times + 1
            #self.epsilon = pow(0.996,self.times)
            #self.epsilon = (self.epsilon - 0.05) if (self.epsilon - 0.05) > 0 else 0
            self.epsilon = math.cos(0.0015 * self.times)
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ_value = max(self.Q[state].values())
        same_maxQ = []

        ###extract the same value of maxQ into a list
        for key, value in self.Q[state].items():
            if value == maxQ_value:
                same_maxQ.append(key)

        #choose an action randomly from the waiting list
        #maxQ = random.sample(same_maxQ, 1)[0]
        maxQ = random.choice(same_maxQ)

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        #if state not in self.Q:
        #    self.Q[state] ={'left':0.0, 'right':0.0, 'forward':0.0, None:0.0}
        if self.learning:
            self.Q.setdefault(state,
                              {action: 0.0
                               for action in self.valid_actions})

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if self.learning == False:
            action = random.sample(self.valid_actions, 1)[0]
        elif random.random() < self.epsilon:
            action = random.sample(self.valid_actions, 1)[0]
        else:
            action = self.get_maxQ(self.state)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        self.Q[state][action] = (
            1 - self.alpha) * self.Q[state][action] + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Exemplo n.º 38
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        self.trial_count = 0  # Use as input for epsilon decay functions

        self.overall_states = 384  # Size of Q-Table = States Combination of all features = 2 * 4 * 4 * 4 * 3
        self.overall_state_action = 384 * 4  # Combination of all possible state + valid actions = 1536
        self.state_action_count = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice

        # Linear: a = 0.05, 20 training trials
        def decay_linear(a):
            self.epsilon -= a

        # Polynomial: a = 20, 20 training trials
        def decay_frac(a):
            self.epsilon = 1.0 / a**2

        # Exponential: a = 0.995, 600 training trials
        def decay_exponential(a):
            self.epsilon = a**self.trial_count

        def decay_exponential_e(a):
            self.epsilon = math.e**(-a * self.trial_count)

        def decay_cosine(a):
            self.epsilon = math.cos(a * self.trial_count)

        # Hold epsilon at 1 during all training trials (in order to explore state-action combination more efficiently)
        def decay_step(total_trials):
            if self.trial_count > total_trials:
                self.epsilon = 0.0

        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0,
        # make sure retrieve from Q-Table instead of random walk.
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            self.trial_count += 1
            decay_exponential(0.995)
            # decay_linear(0.05)
            # decay_step(600)

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # 选择5个基础特征,state是这5个特征所有状态的组合,应该一共有384种。(waypoint只会有3种状态)
        state = (inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {
                    None: 0.0,
                    'forward': 0.0,
                    'left': 0.0,
                    'right': 0.0
                }  # Init default Q Score.
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        # 这里所说的按照epsilon的概率选择任意操作的实际意思是:
        # 首先,你有两个选项,opt1是任选一个操作,opt2一个是选择Q值最高的操作
        # 然后,epsilon的取值范围在[0,1]之内,并且呈现逐渐缩小的趋势,
        # 如果epsilon是0.7,那么opt1的概率就是0.7,opt2的概率就是0.3.
        if not self.learning:
            action = random.choice(
                self.valid_actions
            )  # Pick a random action from All 4 Valid Actions.
        else:
            if random.random() < self.epsilon:
                action = random.choice(
                    self.valid_actions
                )  # Pick a random action from All 4 Valid Actions.
            else:
                # action = max(self.Q[state], key=self.Q[state].get)  # Get Key with max Q Score as action.
                # 需要考虑具有多个最大值的情况,应随机抽取
                max_Q = max(self.Q[state].values())
                max_candidate = [
                    x for x in self.Q[state] if self.Q[state][x] == max_Q
                ]
                action = random.choice(max_candidate)
        """Hard Coded Driving Logic For Fun"""
        # if inputs['light'] == 'red':
        #     if self.next_waypoint == 'right' and inputs['oncoming'] != 'left' and inputs['left'] != 'forward':
        #         action = self.next_waypoint
        #     else:
        #         action = None
        # else:
        #     if self.next_waypoint == 'left' and (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
        #         action = 'forward'
        #     else:
        #         action = self.next_waypoint

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        prev_Q = self.Q[state][action]
        self.Q[state][action] = prev_Q * (1 - self.alpha) + reward * self.alpha

        # 显示当前学习进度,包括<状态>的覆盖程度以及<状态-动作>的覆盖程度。
        if prev_Q == 0 and reward != 0:
            self.state_action_count += 1
        print 'Trial Count =', self.trial_count
        print 'Q-Table Size = {} / {}'.format(len(self.Q), self.overall_states)
        print 'Q-Table Non-zero Item Count = {} / {}'.format(
            self.state_action_count, self.overall_state_action)
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Exemplo n.º 39
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.time_step = 1


    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing == True:
            self.alpha = 0
            self.epsilon = 0
        else:
                self.alpha = 0.007
                self.epsilon = math.exp(-self.time_step * self.alpha)
                self.time_step = self.time_step + 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])

        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        st = self.Q[state]
        # adapted from Lucretiel's answer here:  http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
        max_key = max(st, key=lambda key: st[key])
        return st[max_key]


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning == True:
            if state not in self.Q.keys():
                self.Q[state] = {None: 0, 'forward': 0, 'left': 0, 'right':0}
        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if (self.learning == False) or (self.epsilon > random.random()):
            return random.choice(self.valid_actions)
        else:
            max_val = self.get_maxQ(state)

            # to store actions with same max value
            acts = dict()

            for act, reward in self.Q[state].iteritems():
                if reward == max_val:
                    acts[act] = reward
            return random.choice(acts.keys())


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning == True:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        return


    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
Exemplo n.º 40
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial = 1
        self.epsilonDecayVariable = epsilon
        self.alphaDecayVariable = alpha

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        #self.epsilon =self.epsilon -.05
        #?=e^ ?at ,for 0<a<1
        #self.epsilon =math.exp(-self.epsilonDecayVariable*self.trial*1)
        #?=cos(at),for 0<a<1
        #self.epsilon =math.cos(self.epsilonDecayVariable*self.trial)
        trialNum = self.trial
        ParmA = -5E-11
        ParmB = 0.104
        ParmC = 210

        self.epsilon = 1 * math.exp(
            -ParmA * trialNum) / (1 + math.exp(ParmB * (trialNum - ParmC)))
        #self.epsilon=self.epsilon -.05
        #self.epsilon=math.exp(-0.0005*self.trial)*(4+math.cos(.7*self.trial))/5
        #=EXP   (-lambda*A1)*(4+COS(freq*A1))/5

        #decay learning rate
        #self.alpha =.75 #math.exp(-self.alphaDecayVariable*self.trial)
        #self.epsilon =math.pow(self.epsilonDecayVariable,self.trial )

        # Update additional class parameters as needed
        #not sure on additional class parms  -time remaining, success rate,etc.. ?

        self.trial = self.trial + 1
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        state = (inputs['light'], inputs['oncoming'], inputs['right'],
                 inputs['left'], waypoint)

        if self.learning:
            if state not in self.Q:
                #state is in Q table already
                self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0}

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########

        #get the highest Q value from the dict of actions for this state: state parm.
        maxQ = max(self.Q[state].values())
        #maxQ = None

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        #action = random.choice(self.valid_actions)

        #print(action)
        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        #get max q value
        maxQ = self.get_maxQ(state)

        #initially choose action with highest Q Value
        dictActions = self.Q[state]
        action = dictActions.keys()[dictActions.values().index(maxQ)]

        if self.learning:
            #if the random choice is within epsilon prob, then choose randomly.
            lDblRand = random.random()
            #is the random less than epsilon ?
            if lDblRand < self.epsilon:
                action = random.choice(self.valid_actions)
        else:
            print('not learning')

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + (reward * self.alpha)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Exemplo n.º 41
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self,
                 env,
                 learning=False,
                 epsilon=1.0,
                 alpha=0.5,
                 epsilon_threshold=0.005,
                 a=0.5,
                 experiment="na"):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.epsilon_threshold = epsilon_threshold
        self.t = 0
        self.a = a
        self.experiment = experiment

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            if self.experiment == "2":
                self.epsilon = self.a**self.t
            elif self.experiment == "3":
                if self.t == 0:
                    self.t = 1.0
                self.epsilon = 1.0 / (self.t**2)
            elif self.experiment == "4":
                self.epsilon = math.e**-(self.a * self.t)
            elif self.experiment == "5":
                self.epsilon = math.cos(self.a * self.t)
            else:
                self.epsilon = self.epsilon - self.epsilon_threshold

            self.t = self.t + 1.0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer eatures outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[self.key_state(state)].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            if self.Q.get(self.key_state(state)) == None:
                actions_ = {}
                for a in self.valid_actions:
                    actions_[a] = 0.0
                self.Q[self.key_state(state)] = actions_

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if not self.learning:
            idx = random.randint(0, len(self.valid_actions) - 1)
            action = self.valid_actions[idx]
            return action

        if self.epsilon > 0.01 and self.epsilon > random.random():
            action = self.valid_actions[random.randint(
                0,
                len(self.valid_actions) - 1)]
        else:
            max_value = self.get_maxQ(state)
            actions_ = [
                action_
                for (action_,
                     value_) in self.Q[self.key_state(state)].iteritems()
                if value_ == max_value
            ]
            idx = random.randint(0, len(actions_) - 1)
            action = actions_[idx]

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        if self.learning:
            s_a = self.Q[self.key_state(state)][action]
            self.Q[self.key_state(
                state)][action] = (1 - self.alpha) * s_a + self.alpha * reward
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return

    def key_state(self, state):
        """ It receives an state and returns a string that serves as
            key for the Q table """

        return "{0}_LI:{1}_O:{2}_R:{3}_L:{4}".format(state[0],
                                                     state[1]['light'],
                                                     state[1]['oncoming'],
                                                     state[1]['right'],
                                                     state[1]['left'])
Exemplo n.º 42
0
class QLearningAgent(Agent):
    # An agent using Q learning learns to drive

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)
        self.color = 'red'
        self.planner = RoutePlanner(self.env, self)
        self.deadline = self.env.get_deadline(self)
        self.next_waypoint = None
        self.moves = 0

        self.qDict = dict()
        self.alpha = 0.9  # learning rate
        self.epsilon = 0.05  # probability of flipping the coin
        self.gamma = 0.2

        self.state = None
        self.new_state = None

        self.reward = None
        self.cum_reward = 0

        self.possible_actions = Environment.valid_actions
        self.action = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.next_waypoint = None
        self.moves = 0

        self.state = None
        self.new_state = None

        self.reward = 0
        self.cum_reward = 0

        self.action = None

    def getQvalue(self, state, action):
        key = (state, action)
        return self.qDict.get(key, 5.0)

    def getMaxQ(self, state):
        q = [self.getQvalue(state, a) for a in self.possible_actions]
        return max(q)

    def get_action(self, state):
        """
		epsilon-greedy approach to choose action given the state 
		"""
        if random.random() < self.epsilon:
            action = random.choice(self.possible_actions)
        else:
            q = [self.getQvalue(state, a) for a in self.possible_actions]
            if q.count(max(q)) > 1:
                best_actions = [
                    i for i in range(len(self.possible_actions))
                    if q[i] == max(q)
                ]
                index = random.choice(best_actions)

            else:
                index = q.index(max(q))
            action = self.possible_actions[index]

        return action

    def qlearning(self, state, action, nextState, reward):
        """
		use Qlearning algorithm to update q values
		"""
        key = (state, action)
        if (key not in self.qDict):
            # initialize the q values
            self.qDict[key] = 5.0
        else:
            self.qDict[key] = self.qDict[key] + self.alpha * (
                reward + self.gamma * self.getMaxQ(nextState) -
                self.qDict[key])

    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # take the following variables as states:
        # 	- environ(oncoming, left, right)
        #	- next_waypoint
        #	- traffic light
        self.new_state = inputs
        self.new_state['next_waypoint'] = self.next_waypoint
        self.new_state = tuple(sorted(self.new_state.items()))

        # for the current state, choose an action based on epsilon policy
        action = self.get_action(self.new_state)
        # observe the reward
        new_reward = self.env.act(self, action)
        # update q value based on q learning algorithm
        if self.reward != None:
            self.qlearning(self.state, self.action, self.new_state,
                           self.reward)
        # set the state to the new state
        self.action = action
        self.state = self.new_state
        self.reward = new_reward
        self.cum_reward = self.cum_reward + new_reward
        self.moves = self.moves + 1
Exemplo n.º 43
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial_num = 1

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        #self.epsilon = self.epsilon - 0.005
        #print "self.trial_num",[self.epsilon,self.trial_num,math.log(100.0/(self.trial_num+1)),math.log(100.0/self.trial_num)]
        if self.epsilon > 0.0:
            nr = 10000.0
            self.epsilon = self.epsilon * math.log(
                nr / (self.trial_num + 1)) / math.log(nr / self.trial_num)
        else:
            self.epsilon = 0.0
        """
        if self.alpha>0.3:
            self.alpha=self.alpha-0.05
        else:
            self.alpha=0.3
            
        """
        self.alpha = self.epsilon + 0.2
        if self.alpha > 1.0:
            self.alpha = 1.0
        self.trial_num = self.trial_num + 1
        if testing == True:
            self.epsilon = 0.0
            self.alpha = 0.0
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer eatures outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        #state = None
        #state={'waypoint':waypoint,'light':inputs['light'],'oncoming':inputs['oncoming'], 'left':inputs['left'] }
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = None
        if state in Q:
            actions_dict = Q[state]
        """
        for action, value in actions_dict.iteritems():
                if maxQ==None:
                    maxQ=(value,action)
                elif value>maxQ[0]:
                    maxQ=(value,action)
                elif value<maxQ[0]:
                    pass
                elif value==maxQ[0]:
                    maxQ.add(action)
        """
        for action, value in actions_dict.iteritems():
            if not maxQ:
                maxQ = value
            elif maxQ < value:
                maxQ = value
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if state not in self.Q:
            self.Q[state] = {
                "left": 0.0,
                "right": 0.0,
                "forward": 0.0,
                None: 0.0
            }
            self.Q[state][state[0]] = 1.0
        return self.Q

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None
        """action_id = random.randint(0,3)
        action  = self.valid_actions[action_id]
        current_input = self.env.sense(self)
        light_state = current_input['light']
        action =  self.next_waypoint
        print {'light_state':light_state,'self.next_waypoint':self.next_waypoint,'action':action, 'left':current_input['left'] , 'oncoming':current_input['oncoming'],'right':current_input['right']}
        if light_state != 'green':
            if self.next_waypoint != 'right':
                action = None
            else:
                if current_input['left'] == 'forward' or current_input['oncoming'] == 'left':
                    action = None
                else:
                    action =self.next_waypoint
        if light_state == 'green':
            if self.next_waypoint != 'left':
                action = self.next_waypoint
            elif current_input['oncoming']=='forward':
                action = None
#        action_id = random.randint(0,3)
#        action  = self.valid_actions[action_id]
#        print "action:",action,action_id
        """
        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(self.env.valid_actions)
        if not self.learning:
            return random.choice(self.env.valid_actions)

        maxQ = None
        for action, value in self.Q[state].iteritems():
            if not maxQ:
                maxQ = [value, action]
            elif maxQ[0] == value:
                maxQ.append(action)
            elif maxQ[0] < value:
                maxQ = [value, action]

        action_to_return = random.choice(maxQ[1:])
        """if action_to_return==None:
            action_to_return = None"""
        return action_to_return

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.alpha * reward + (
                1 - self.alpha) * self.Q[state][action]
        return True

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        #print "taking action:",action,type(action)
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Exemplo n.º 44
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.policy = {}
        self.alpha = 0.4
        self.gamma = 0.3
        self.actions = [None, 'forward', 'right', 'left']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        # The states can be defined as what the current environment is, .i.e., the traffic light and the intersection situation +
        # the location of the next waypoint.(Hardcoding states is kind of bad idea as they are many of them)
        self.state = inputs
        self.state['next_waypoint'] = self.next_waypoint
        self.state = tuple(self.state.items())
        # select random action if state not present in the policy: Agent has not seen the state

        # TODO: Select action according to your policy
        #print type(self.policy)
        action = self.actions[random.randrange(4)] if self.state not in self.policy else self.policy[self.state]
        reward = self.env.act(self, action)
        state_prime = self.env.sense(self)
        state_prime['next_waypoint'] = self.planner.next_waypoint()
        state_prime = tuple(state_prime.items())
        
        # Now we have all the variables that are input to q learning formula: state, reward, action, and next_state:
        # What goes next? We tweak q_value for the present state action pair and then choose the action of the state which maximized q(the optimal policy)
        # According to the formula we need to calculate the utility of the next state. max(a) Q(s', a'). The policy is holding argmax(a) Q(s,a) values.
        # Thus, as the Q learning algorithm converges, we would be finding he max(a) Q(s', a') if the policy has seen that state. Thus, there is no need to
        # calculate the max(a') Q(s', a') everytime as that would be recursively going on till i don't know what
        
        action_prime  = self.actions[random.randrange(4)] if state_prime not in self.policy else self.policy[state_prime]

        # TODO: Learn policy based on state, action, reward
        # Q_learning
        state_act_pair = (self.state, action)
        self.Q[state_act_pair] = (1 - self.alpha) * self.Q.get(state_act_pair, 0) + self.alpha*(reward + self.gamma*(self.Q.get((state_prime, action_prime), 0)))
        # Now we need to update the policy well(Take action that maximizes the utility of the present state, thus updating policy)
        Q_max_candidates = [self.Q.get((self.state, act)) for act in self.actions]
        try:
            if None not in Q_max_candidates:
                # PERFORMS VERY POORLY WHEN ALL STATES NOT ALREADY SEEN.(11/100) THIS IS SO BECAUSE THE POLICY SHOULD NOT BE UPDATED 
                # UNTIL ALL THE ACTIONS CORRESPONDING TO THE STATE HAS BEEN SEEN. OR ELSE IT WILL KEEP FOLLOWING THE SAME POLICY. AND 
                # WILL NEVER FURTHER LEARN.
                # ANOTHER METHOD IS THE EXPLORATION-EXPLOITATION METHOD WHEREBY THE ACTION IN THE POLICY IS TAKEN WITH PROB 1-e AND ANOTHER
                # RANDOM ACTION WITH PROB e. 
                Q_max = max(Q_max_candidates)
                actions_candidates = [act for act in self.actions if Q_max == self.Q.get((self.state, act))]
                self.policy[self.state]= random.choice(actions_candidates)
        except Exception as e:
            print "Could not update the policy by choosing the max Q: {}".format(e)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 45
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.state = None
        self.q_table = {}
        self.frame = 0
        self.last_state_tuple = None
        self.last_action = None
        self.alpha = .5

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.last_state_tuple = None
        self.last_action = None

    def update(self, t):

        self.frame += 1

        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  #e.g. left, forward, right
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        env_state = self.env.sense(self)
        state_tuple = (env_state['light'], self.next_waypoint)
        self.state = "Light: %s, Waypoint: %s" % state_tuple

        # TODO: Select action according to your policy
        # TODO: Learn policy based on state, action, reward

        #default to random action
        action = random.choice([None, 'forward', 'left', 'right'])

        #look for action prescribed by q table
        if state_tuple in self.q_table:

            #calculate epsilon
            epsilon = 1 - (((1. / (n_trials * 30.)) * self.frame) + .5)
            if epsilon >= 1.:
                epsilon = .99
            print "EP: ", epsilon

            #randomly decide whether to act randomly, based on epsilon
            roll = random.random()
            if roll < epsilon:
                reward = self.env.act(self, action)
                self.q_table[state_tuple].append((action, reward))
            else:
                #act according to highest recorded q value for the state
                q_temp = 0
                for i in self.q_table[state_tuple]:
                    val = i[1]
                    if val > q_temp:
                        q_temp = val
                        action = i[0]
                reward = self.env.act(self, action)

        #if action not in q table:
        else:
            reward = self.env.act(self, action)

            #q value for the state is "initialized" during the simulation (i.e. when the state/action pair is first encountered).
            #It is initialized to the immediate reward.
            self.q_table[state_tuple] = [(action, reward)]

        #update q value for last state
        if self.last_state_tuple != None:
            c = 0
            for i in self.q_table[self.last_state_tuple]:
                if i[0] == self.last_action:
                    last_act = self.q_table[self.last_state_tuple][c][0]
                    last_val = self.q_table[self.last_state_tuple][c][1]
                    updated_val = (last_val *
                                   (1 - self.alpha)) + (self.alpha * reward)

                    self.q_table[self.last_state_tuple][c] = (last_act,
                                                              updated_val)

        #save last state_tuple, so it can be updated during the next iteration
        self.last_state_tuple = state_tuple
        self.last_action = action

        print "FRAME: ", self.frame
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, env_state, action, reward)  # [debug]