Example No. 1
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint
        self.learner = QLearn(epsilon=1, alpha=0.3, gamma=0.6)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.learner.action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_inputs = self.env.sense(self) 
        self.future_next_way_out = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.future_next_way_out)
        #next_state = (next_inputs[0], next_inputs[1], next_inputs[3], self.planner.next_waypoint())
        self.learner.learn(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 2
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = 0.2
        self.Qtable = {}
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.reward = 0
        
        
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # Inform Driving Agent
        self.state = (inputs['light'], inputs['oncoming'], 
                      inputs['left'], inputs['right'], self.next_waypoint)
        
        
        # TODO: Select action according to your policy
        valid_actions = [None, 'forward', 'left', 'right']
        
        # Random Pick Actions
        ### action = random.choice(valid_actions)
        
        # Get all Q values for all state
        all_actions = {action: self.Qtable.get((self.state, action), 0)
                        for action in valid_actions}
                           
        # Find action that maximizes Q values
        best_actions = [action for action in valid_actions 
                        if all_actions[action] == max(all_actions.values())]
           
        # Return Best Action
        action = random.choice(best_actions)    
         
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        # To simplify the problem, I set gamma equal to 0.
        self.Qtable[(self.state, action)] = \
        (1 - self.alpha) * self.Qtable.get((self.state, action), 0) + self.alpha * reward
     
           
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class BasicAgent(Agent):
    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.color = 'red'  # override color

    def reset(self, destination=None):
        self.planner.route_to(destination)  
        
    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        action = random.choice(self.env.valid_actions)
        reward = self.env.act(self, action)
        
        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        right = inputs['right']
       
        #Update the current observed state
        self.state = (light, oncoming, left, right, self.next_waypoint)
        
        print "Basic.update(): next move = {}".format(action)  # [debug]
Example No. 4
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.possible_actions= Environment.valid_actions
        self.ai = QLearn(epsilon=.05, alpha=0.1, gamma=0.9)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        inputs = inputs.items()  # note: the positional indexing below relies on the dict's arbitrary item ordering
        deadline = self.env.get_deadline(self)

        self.state = (inputs[0],inputs[1],inputs[3],self.next_waypoint)
        
        action = self.ai.Q_move(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        inputs2 = self.env.sense(self) 
        inputs2 = inputs2.items()
        next_state = (inputs2[0],inputs2[1],inputs2[3],self.next_waypoint)

        self.ai.Q_post(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.6, discount_factor=0.35):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.state = None
        self.Q_values = self.initialize_Q_values()
 
    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
       
    def initialize_Q_values(self, val=10.0):
        Q_values = {}
        for waypoint in ['left', 'right', 'forward']:
            for light in ['red', 'green']:
                for action in self.env.valid_actions:
                    Q_values[((waypoint, light), action)] = val
        return Q_values

    def find_max_Q(self, state):
        action = None
        max_Q = 0.0
        for a in self.env.valid_actions:
            Q = self.Q_values[(state, a)]
            if Q > max_Q:
                action = a
                max_Q = Q
        return (max_Q, action)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'])

        # TODO: Select action according to your policy
        #action = random.choice(self.env.valid_actions) 
        (Q, action) = self.find_max_Q(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        state_prime = (next_waypoint, inputs['light'])
        (Q_prime, action_prime) = self.find_max_Q(state_prime)        
        Q += self.alpha * (reward + self.gamma*Q_prime - Q) 
        self.Q_values[(self.state, action)] = Q

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 6
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here



    def reset(self, destination=None):
        self.planner.route_to(destination)



    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = inputs['location']
        self.heading=inputs['heading']

        action = random.choice(Environment.valid_actions)
        reward = self.env.act(self, action)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]
Example No. 7
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # location, heading, delta, deadline, oncoming, traffic light state
        inputs['waypoint'] = self.next_waypoint

        self.state = tuple(sorted(inputs.items())) # performs the actual update

        # TODO: Select action according to your policy
        actions = [None,'forward','left','right']
        action = actions[random.randint(0,3)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 8
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        #Actions
        self.A = ['forward', 'left', 'right', None] #all the actions available
        self.trial = 0

        #Initialize Q table(light, oncoming, next_waypoint)
        self.Q={}
        for i in ['green', 'red']:  #possible lights
            for j in [None, 'forward', 'left', 'right']: #possible oncoming_agent
                for k in ['forward', 'left', 'right']: #possible next_waypoints
                    self.Q[(i,j,k)] = [1] * len(self.A)


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)
        #Find the index of the action with the highest Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        #assign action
        p = random.randrange(0, 5)
        epsilon = 1.5  # p is in {0, 1, 2, 3, 4}, so p < 1.5 makes the agent act randomly about 40% of the time
        if p < epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        gamma = 0.30
        alp_tune = 500  # tuning parameter for the decay schedule
        alpha = 1 / (1.1 + self.trial / float(alp_tune))  # decaying learning rate (float division so alpha decays smoothly)
        self.trial = self.trial+1

        #get the next action state Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_next_waypoint)

        #Update Q Table
        self.Q[self.state][self.A.index(action)] = \
        (1-alpha)*self.Q[self.state][self.A.index(action)] + \
        (alpha * (reward + gamma * max(self.Q[next_state])))
Example No. 9
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        # # from route planner, also displayed by simulator
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = self.next_waypoint, inputs['light']
        # self.state = self.next_waypoint, (inputs['oncoming'] == None or inputs['light'] == 'green')

        # TODO: Select action according to your policy
        # Define actions
        actions = [None, 'forward', 'left', 'right']

        # QUESTION 1- select random action
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
class LearningAgent(Agent):
    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.next_waypoint = None
        self.total_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        loc, hed = self.get_my_location()
        action = self.get_next_waypoint_given_location( loc, hed)
        action_okay = self.check_if_action_is_ok(inputs)
        if not action_okay:
            action = None
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        #TODO: Learn policy based on state, action, reward
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 11
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_matrix = {}
        self.alpha = 0.95
        self.gamma = 0.05
        self.explore = 0.05
        self.state = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        deadline = self.env.get_deadline(self)

        state = self.observe_state()
        self.state = state
        
        knowledge = self.knowledge_from_state(state)

        # TODO: Select action according to your policy
        if random() < self.explore:
            action = choice(OPTIONS)
        else:
            action = sorted(knowledge.items(), key=lambda x: x[1], reverse=True)[0][0]

        old_value = knowledge[action]

        # Execute action and get reward
        reward = self.env.act(self, action)
        new_state = self.observe_state()
        new_state_knowledge = self.knowledge_from_state(new_state)

        self.q_matrix[state][action] = (1-self.alpha) * old_value +\
                                       self.alpha * (reward + self.gamma * max(new_state_knowledge.values()))
        

        #print "LearningAgent.update(): deadline = {}, state = {}, action = {}, reward = {}".format(deadline, state, action, reward)  # [debug]

    def knowledge_from_state(self, state):
        """
        Gets what we know about the given state
        """
        if not state in self.q_matrix:
            self.q_matrix[state] = {option: 0 for option in OPTIONS}
        return self.q_matrix[state]

    def observe_state(self):
        """
        Observe the current state
        """
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        return (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
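This snippet depends on names defined at module level that are not shown; the assumed setup would be along the lines of:

from random import random, choice

OPTIONS = (None, 'forward', 'left', 'right')  # the smartcab's valid actions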
Example No. 12
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        
        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]
Example No. 13
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.prevReward = 0
        self.prevAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.prevReward = 0
        self.prevAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = None
        validActions = []
        
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                validActions = ['right', None]
        else:
            if inputs['oncoming'] == 'forward':
                validActions = [ 'forward','right']
            else:
                validActions = ['right','forward', 'left']

        # TODO: Select action according to your policy
        if validActions != []:
            action = random.choice(validActions)
        else:
            action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if action != None:
            self.state = (inputs, self.next_waypoint, deadline)
            self.prevAction = action
            self.prevReward = reward


        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 14
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qtable = defaultdict(dict)
        self.epsilon = .4  # Randomness factor
        self.alpha = .4  # Learning rate.

    def reset(self, destination=None):
        # Prepare for a new trip; reset any variables here, if required
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.define_state(inputs, self.next_waypoint)

        # Select action according policy
        action = self.select_action()

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_qtable(action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]

    def select_action(self):
        if random.random() < self.epsilon or self.state not in self.qtable:
            action = random.choice(self.env.valid_actions)
        else:
            action = max(self.qtable[self.state], key=self.qtable[self.state].get)

        if self.state not in self.qtable:
            for a in self.env.valid_actions:  # init action:reward dict.
                self.qtable[self.state][a] = 0
        return action

    def update_qtable(self, action, reward):
        old_q = self.qtable[self.state].get(action, 0)
        self.qtable[self.state][action] = ((1 - self.alpha) * old_q) + (self.alpha * reward)
        # Decay alpha and epsilon until it reaches 0.1 and 0.0, respectively.
        self.alpha = max(0.001, self.alpha - 0.002)
        self.epsilon = max(0.0, self.epsilon - 0.002)

    @staticmethod
    def define_state(inputs, next_waypoint):
        return tuple(inputs.values()) + (next_waypoint, )
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        valid_waypoints = ['forward','right','left']
        valid_lights = ['red','green']
        valid_actions = ['forward','right','left', None]

        self.Q_hat={}
        states = [(next_waypoint,light,oncoming,right,left) for next_waypoint in valid_waypoints for light in valid_lights for oncoming in valid_actions for right in valid_actions for left in valid_actions]
        states.append('Destination')
        for state in states:
            self.Q_hat[state] = {}
            for action in valid_actions:
                self.Q_hat[state][action] = random.random()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if self.next_waypoint == None:
            self.state = 'Destination'
        else:
            self.state = (self.next_waypoint, inputs['light'],inputs['oncoming'],inputs['right'],inputs['left'])

        # TODO: Select action according to your policy

        valid_actions = ['forward','right','left', None]
        epsilon = 0.1 #0.01

        best_action = max(self.Q_hat[self.state],key=self.Q_hat[self.state].get)
        random_action = choice(valid_actions)
        action = choice([best_action, random_action],p=[1-epsilon,epsilon])

        # Execute action and get reward

        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward

        new_next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs = self.env.sense(self)
        new_state = (new_next_waypoint, new_inputs['light'],new_inputs['oncoming'],new_inputs['right'],new_inputs['left'])
        alpha = 0.5 #opt 0.7
        gamma = 0.5 #opt 0.1
        max_Qhat_ahat = max(self.Q_hat[new_state].values())
        self.Q_hat[self.state][action] = (1-alpha)*self.Q_hat[self.state][action]+alpha*(reward+gamma*max_Qhat_ahat)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 16
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = [None, 'forward', 'left', 'right']
        self.learning_rate = 0.3
        self.state = None
        self.q = {}
        self.trips = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.trips += 1
        if self.trips >= 100:
            self.learning_rate = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],
                 self.next_waypoint)
        self.state = state

        # Lazy initialization of Q-values
        for action in self.actions:
            if (state, action) not in self.q:
                self.q[(state, action)] = 1
        
        # Select action according to the policy

        probabilities = [self.q[(state, None)], self.q[(state, 'forward')], self.q[(state, 'left')], self.q[(state, 'right')]]
        # Use the softmax function so that the values actually behave like probabilities
        probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=0)

        adventurousness = max((100 - self.trips) / 100.0, 0)  # decays from 1 to 0 over the first 100 trips
        if random.random() < adventurousness:
            action = np.random.choice(self.actions, p=probabilities)
        else:
            action = self.actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q[(state, action)] = self.learning_rate * reward + (1 - self.learning_rate) * self.q[(state, action)]

        #print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def get_state(self):
        return self.state
Example No. 17
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Qtable = {}
        self.alpha = 0.8
        self.gamma = 0.6

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        old_state = self.state
        
        # TODO: Select action according to your policy
        if (old_state, None) not in self.Qtable.keys():
            self.Qtable[(old_state, None)] = 0
        if (old_state, 'left') not in self.Qtable.keys():
            self.Qtable[(old_state, 'left')] = 0
        if (old_state, 'forward') not in self.Qtable.keys():
            self.Qtable[(old_state, 'forward')] = 0
        if (old_state, 'right') not in self.Qtable.keys():
            self.Qtable[(old_state, 'right')] = 0
        
        max_action = max(self.Qtable[(old_state, None)], self.Qtable[(old_state, 'left')],self.Qtable[(old_state, 'forward')],self.Qtable[(old_state, 'right')])
        action = random.choice([ i for i in ['left', 'forward', 'right', None] if self.Qtable[(old_state, i)] == max_action])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        #self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        new_state = self.state

        if (new_state, None) not in self.Qtable.keys():
            self.Qtable[(new_state, None)] = 0
        if (new_state, 'left') not in self.Qtable.keys():
            self.Qtable[(new_state, 'left')] = 0
        if (new_state, 'forward') not in self.Qtable.keys():
            self.Qtable[(new_state, 'forward')] = 0
        if (new_state, 'right') not in self.Qtable.keys():
            self.Qtable[(new_state, 'right')] = 0

        self.Qtable[(old_state, action)] = (1 - self.alpha) * self.Qtable[(old_state, action)] + self.alpha *(reward + self.gamma * max(self.Qtable[(new_state, i)] for i in [None, 'left', 'forward', 'right']))
Example No. 18
class BasicAgent(Agent):
    """A basic agent upon which to build learning agents."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qvals = {} # mapping (state, action) to q-values
        self.time = 0 # number of moves performed
        self.possible_actions = (None, 'forward', 'left', 'right')
        self.n_dest_reached = 0 # number of destinations reached
        self.last_dest_fail = 0 # last time agent failed to reach destination
        self.sum_time_left = 0 # sum of time left upon reaching destination over all trials
        self.n_penalties = 0 # number of penalties incurred
        self.last_penalty = 0 # last trial in which the agent incurred in a penalty

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def best_action(self, state):
        """
        Return a random action (other agents will have different policies)
        """
        return random.choice(self.possible_actions)

    def update_qvals(self, state, action, reward):
        """
        Keeps track of visited (state, action) pairs. 
        (in other agents will use reward to update 
        the mapping from (state, action) pairs to q-values)
        """
        self.qvals[(state, action)] = 0
    
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # update time
        self.time += 1

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'],
                      self.next_waypoint)

        # Pick an action
        action = self.best_action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.n_penalties += 1
        
        # Update the q-value of the (state, action) pair
        self.update_qvals(self.state, action, reward)
        
        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 19
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        self.time_step = 1.0
    
    def get_decay_rate(self, t):  # decay rate for epsilon
        return 1.0 / float(t)
       
    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.time_step = 1.0
        # TODO: Prepare for a new trip; reset any variables here, if required
        '''
        self.previous_action = None
        self.next_state = None
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        '''
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        #env_states = self.env.agent_states[self]
        self.time_step += 1
        epsilon = self.get_decay_rate(self.time_step)
        self.QLearner.set_eps(epsilon)
        
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        #self.state = (inputs['light'], self.next_waypoint)

        action = self.QLearner.q_move(self.state)        
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        
        new_inputs = self.env.sense(self)
        #env_states = self.env.agent_states[self]
        new_deadline = self.env.get_deadline(self)
        self.next_state =  (new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], self.next_waypoint)
        #self.next_state =  (new_inputs['light'], self.next_waypoint)
        
        # TODO: Learn policy based on state, action, reward

        self.QLearner.q_future_reward(self.state, action, self.next_state, reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 20
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha, q_initial):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.policy = {}
        self.trip_log = []
        self.trip = None
        self.alpha = alpha
        self.q_initial = q_initial

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        if self.trip:
            self.trip_log.append(self.trip)
        self.trip = {}
        self.trip['Deadline'] = self.env.get_deadline(self)
        self.trip['Reward'] = 0
        self.trip['Penalty'] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        inputs.pop('right')
        self.state = inputs
        self.state['waypoint'] = self.next_waypoint

        # TODO: Select action according to your policy
        state_hash = hashabledict(frozenset(self.state.iteritems()))
        q = self.policy.get(state_hash, {
            None: self.q_initial,
            'forward': self.q_initial,
            'left': self.q_initial,
            'right': self.q_initial,
        })
        action = max(q.iteritems(), key=operator.itemgetter(1))[0]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Update trip stats
        self.trip['Reward'] += reward
        self.trip['Remaining'] = self.env.get_deadline(self)
        self.trip['Success'] = self.planner.next_waypoint() == None
        if reward < 0: self.trip['Penalty'] += reward

        # TODO: Learn policy based on state, action, reward
        q[action] = (1 - self.alpha) * q[action] + self.alpha * reward
        self.policy[state_hash] = q
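The hashabledict helper (and the operator import) used above are not part of the snippet; a minimal assumption-based version that lets the state dict serve as a Q-table key:

import operator  # used by the max(...) over q.iteritems() above

class hashabledict(dict):
    # Hashing over the (key, value) pairs lets equal states share one policy entry.
    def __hash__(self):
        return hash(frozenset(self.items()))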
Example No. 21
class BasicLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(BasicLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0
        self.deadline = self.env.get_deadline(self)
        self.actions = ['forward', 'left', 'right', None]
        self.reached_destination = 0
        self.penalties = 0
        self.movements = 0

    def tuple_state(self, state):
        State = namedtuple("State", ["light", "next_waypoint"])
        return State(light = state['light'], next_waypoint = self.planner.next_waypoint()) 

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # TODO: Update state
        self.state = self.tuple_state(inputs)

        # TODO: Select action according to your policy
        action = random.choice(self.actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        if reward >= 10: self.reached_destination += 1
        if reward < 0: self.penalties += 1
        self.movements += 1

        # TODO: Learn policy based on state, action, reward 
        self.last_action = action
        self.last_state = self.state
        self.last_reward = reward
        self.total_reward += reward
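tuple_state above relies on namedtuple, and the action choice on random; the imports this snippet assumes (not shown) would be:

import random
from collections import namedtuple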
Example No. 22
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        
        # TODO: Initialize any additional variables here
        self.acts = ['left', 'right', 'forward', 'stay']
        df_tmp = pd.DataFrame(0*np.ones((2, 4)), index=['red', 'green'], columns=self.acts)
        self.Q = df_tmp
        self.deadline = 0
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        self.deadline = deadline

        # TODO: Update state
        self.state = inputs['light'] # in this case the state is simply defined by the color of the traffic light at the intersection
        
        # TODO: Select action according to your policy
        # Q-Learner
        act_opt = self.Q.ix[self.state].argmax()
        if act_opt=='stay': act_opt=None
#            print "Current state = %s, qlearner action = %s" % (self.state, act_opt)
        action = act_opt

        # Execute action and get reward
        reward = self.env.act(self, action)


        # TODO: Learn policy based on state, action, reward

        # Maximize Q conditional on the state we are in = S
        q_S = self.Q.ix[self.state] # Q conditional on state S
        q_S_mx = q_S[q_S==q_S.max()] # just in case there are multiple values and the action that maximizes Q(S, a) is not unique
        ix_mx = np.random.randint(0,len(q_S_mx)) # pick one of the tied-for-best actions at random
        q_S_mx = q_S_mx.iloc[ix_mx]

        # Update Q
        act_tmp = action if action !=None else 'stay'
        global alpha, alpha0, gamma
        alpha *= alpha0
        self.Q.ix[self.state, act_tmp] = \
            (1-alpha)*self.Q.ix[self.state, act_tmp] + \
            alpha*(reward + gamma*q_S_mx) # note: the action actually taken is updated, while the bootstrap term uses the value of the best action available from this state
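The update step reads alpha, alpha0 and gamma from module scope; those globals are not shown here. An initialization consistent with the multiplicative decay applied to alpha would be (assumed values):

alpha = 0.5    # initial learning rate
alpha0 = 0.99  # per-update decay multiplier applied to alpha
gamma = 0.2    # discount factor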
Example No. 23
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.QLearning = QLearning(epsilon=0.3, alpha=0.95, gamma=0.9)
        # self.QLearning = pickle.load(open('./QLearning.pkl'))

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def is_action_okay(self, inputs):
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False

        return action_okay

    def update(self, t):
        # Gather states
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        action_okay = self.is_action_okay(inputs)
        self.state = (action_okay, self.next_waypoint, deadline < 3)
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.QLearning.policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Gather next states
        next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        next_inputs = self.env.sense(self)
        next_deadline = self.env.get_deadline(self)
        next_action_okay = self.is_action_okay(next_inputs)
        next_state = (next_action_okay, next_waypoint, next_deadline < 3)
        # next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_waypoint)

        # TODO: Learn policy based on state, action, reward
        self.QLearning.update_QTable(self.state, action, reward, next_state)
Example No. 24
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.learning_rate = 0.9
        self.states = None
        self.possible_actions = [None, 'forward', 'left', 'right']
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)



        # TODO: Update state
        # state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],deadline)
        state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])
        # state = (self.next_waypoint,inputs['light'])
        self.state = state

        for action in self.possible_actions:
            if(state, action) not in self.Q: 
                self.Q[(state, action)] = 20

        # TODO: Select action according to your policy
        Q_best_value = [self.Q[(state, None)], self.Q[(state, 'forward')], self.Q[(state, 'left')], self.Q[(state, 'right')]]
        # Softmax
        Q_best_value = np.exp(Q_best_value) / np.sum(np.exp(Q_best_value), axis=0)
        
        #action = np.random.choice(self.possible_actions, p=probabilities)
        action = self.possible_actions[np.argmax(Q_best_value)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward == -10.0 :
            print "Invalid move"
        if reward == -0.5 :
            print "valid, but is it correct?"
        

        # TODO: Learn policy based on state, action, reward
        # self.Q[(state,action)] = reward + self.learning_rate * self.Q[(state,action)]
        self.Q[(state,action)] = reward * self.learning_rate + ( 1 - self.learning_rate ) * self.Q[(state,action)]
Example No. 25
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.78, discount_factor=0.36):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.actions = Environment.valid_actions
        self.oldAction = None


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.oldAction=None
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        #------------------------------------------------
        #action = np.random.choice(self.actions)
        #------------------------------------------------
        


        # TODO: Select action according to your policy
        current_state = tuple(inputs.values() + [self.next_waypoint])
        self.state = (inputs, self.next_waypoint)
        
        if current_state not in self.Q:
            self.Q[current_state] = [3, 3, 3, 3]

        Q_max = self.Q[current_state].index(np.max(self.Q[current_state]))
        # Execute action and get reward
        action = self.actions[Q_max]
        reward = self.env.act(self, action)



        # TODO: Learn policy based on state, action, reward
        
        if t != 0:
            oldvalue = self.Q[self.oldAction[0]][self.oldAction[1]]
            newvalue = ((1 - self.learning_rate) * oldvalue) + (self.learning_rate * (self.oldAction[2] + self.discount_factor * self.Q[current_state][Q_max]))
            self.Q[self.oldAction[0]][self.oldAction[1]] = newvalue
        
        self.oldAction = (current_state, self.actions.index(action), reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example No. 26
class RandomAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(RandomAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.availableAction = ['forward', 'left', 'right',None]   
        self.goal=0
        self.steps=0
        self.features=[]
        self.deadline=[]
        self.total_reward=[0]

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        #print"RESET, Final state:\n", self.state
        try:
            if self.deadline[len(self.features)-1] > 0:  # time still remaining, so the destination was reached before the deadline
                self.goal+=1 #FIXME - order
                print "PASS! {} steps to goal,Goal reached {} times out of {}!".format(
                                                        self.deadline[len(self.features)-1],self.goal,len(self.features))
            else:
                print "FAIL! {} steps to goal,Goal reached {} times out of {}!".format(
                                                        self.deadline[len(self.features)-1],self.goal,len(self.features))
                pass
        except:
            print "Trial 0 - Goal reached {} times out of {}!".format(self.goal,len(self.features))
            pass
        print "----------------------------------------------------------"
        self.features.append({})
        self.deadline.append(None)
        self.total_reward.append(0)
        self.steps=0

    def update(self, t):
        # Gather inputs
        self.steps+=1
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        #self.deadline[len(self.features)] = self.env.get_deadline(self)
        self.state=inputs
        self.features[len(self.features)-1][self.steps]=inputs
        self.deadline[len(self.deadline)-1] = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        action = self.availableAction[random.randint(0,3)]    
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.lastReward=reward
        # TODO: Learn policy based on state, action, reward
        self.total_reward[-1] += reward
Example No. 27
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here        
        self.actions = ['forward', 'left', 'right',None]        
        self.q_table = Counter()        
        self.alpha = 0.5
        self.gamma = 0.2


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def get_best_action(self, state):
        best_profit = -999
        best_action = None
        for a in self.actions:
            profit = self.q_table[(state, a)]            
            if profit > best_profit:
                best_profit = profit
                best_action = a
        return best_action, best_profit


    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state        
        self.state = (inputs['light'],inputs['oncoming'], inputs['left'], self.next_waypoint)       

        if randint(0,10) < 3:  # explore: take a random action roughly 27% of the time
            action = self.actions[randint(0, 3)]
        else:        
            action, _ = self.get_best_action(self.state)
        
        reward = self.env.act(self, action)

        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_inputs['left'], next_next_waypoint)      
       
        _, best_profit = self.get_best_action(next_state)
        self.q_table[(self.state, action)] = (1-self.alpha)*self.q_table[(self.state, action)] + (self.alpha * (reward + self.gamma * best_profit))

       
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 28
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.debugOn = False
        self.reporter = Reporter(driver_type, self.env.valid_actions)
        if driver_type=='random':
            self.driver = RandomDriver(self.env.valid_actions)
        elif driver_type=='greedy':
            self.driver = GreedyDriver(self.env.valid_actions)
        elif driver_type=='greedy2':
            self.driver = Greedy2Driver(self.env.valid_actions)
        elif driver_type=='smart':
            self.driver = SmartDriver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart2':
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart3':
            self.driver = Smart3Driver(self.env.valid_actions, self.reporter)
        else:
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.reporter.reset(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = State.make(self.next_waypoint, inputs)
        
        # TODO: Select action according to your policy
        action = self.driver.decide(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        newWaypoint = self.planner.next_waypoint()
        newState = State.make(newWaypoint, self.env.sense(self))
        self.driver.learn(t, deadline, self.state, action, reward, newState)

        location = self.env.agent_states[self]['location']
        if self.debugOn:
            print "LearningAgent.update(): deadline={} inputs={} action={} reward={} location={}".format(deadline, inputs, action, reward, location)  # [debug]
        self.reporter.update(t, deadline, self.state, action, reward, location, self.driver)
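
The if/elif chain in __init__ above maps the driver_type string to a driver class. A dictionary dispatch is an equivalent, more compact sketch (assumes the same RandomDriver/GreedyDriver/Smart*Driver classes defined elsewhere in this example):

def make_driver(driver_type, valid_actions, reporter):
    """Build the requested driver, defaulting to Smart2Driver for unknown types."""
    simple = {'random': RandomDriver, 'greedy': GreedyDriver, 'greedy2': Greedy2Driver}
    smart = {'smart': SmartDriver, 'smart2': Smart2Driver, 'smart3': Smart3Driver}
    if driver_type in simple:
        return simple[driver_type](valid_actions)
    return smart.get(driver_type, Smart2Driver)(valid_actions, reporter)
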
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, trials):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        # TODO: Initialize any additional variables here
        self.trials = trials
        self.success = 0  # How many times was the agent able to reach the target over the given trials?

        self.penalties = 0
        self.total_rewards = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)

        # TODO: Prepare for a new trip; reset any variables here, if required
        self.penalties = 0
        self.total_rewards = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # self.state = inputs
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'])
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = random.choice(self.env.valid_actions)  # Initial random movement / random action

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".\
            format(deadline, inputs, action, reward)  # [debug]

        # Statistics
        self.total_rewards += reward

        if reward < 0:
            self.penalties += 1

        if reward > 5:
            self.success += 1
            print "\nTrial info: Number of Penalties =  {}, Net rewards: {}. \nTotal success/trials: {}/{}. \n".\
                format(self.penalties, self.total_rewards, self.success, self.trials)
Exemplo n.º 30
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0

        #Q matrix
        self.Q = {}

        # initial Q values
        self.initial_q = 30

        # Initialize state
        self.states = None

        # Dictionary of possible actions to index states from Q matrix
        self.actions = {None : 0, 'forward' : 1, 'left' : 2, 'right' : 3}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Check if this state is included in the Q matrix. If not, create a new row in the matrix.
        if self.state not in self.Q:
            self.Q[self.state] = [self.initial_q for i in range(len(self.env.valid_actions))]

        # TODO: Select action according to your policy
        learning_rate = 0.8
        action = self.env.valid_actions[np.argmax(self.Q[self.state])]
        discount = 0
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        # TODO: Learn policy based on state, action, reward

        # get the future state by re-sensing the environment after acting
        next_inputs = self.env.sense(self)
        future_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.planner.next_waypoint())
        if future_state not in self.Q:
            self.Q[future_state] = [self.initial_q for i in range(len(self.env.valid_actions))]

        # Update the Q matrix (consider the best available future reward, not just the same action's value)
        #self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] * (1 - learning_rate) + reward * learning_rate
        self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] + learning_rate * (reward + discount * max(self.Q[future_state]) - self.Q[self.state][self.actions[action]])
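
Rather than an epsilon term, the example above relies on optimistic initialization: every unseen state starts with Q-values of 30, well above any single-step reward, so greedy argmax selection keeps trying untested actions until their estimates fall toward the observed rewards. A small sketch of that pattern in isolation (hypothetical names):

def ensure_optimistic_row(Q, state, n_actions, initial_q=30.0):
    """Create an optimistic Q-row for an unseen state so greedy selection still explores."""
    if state not in Q:
        Q[state] = [initial_q] * n_actions
    return Q[state]

# usage sketch: row = ensure_optimistic_row(Q, state, 4); action_index = row.index(max(row))
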
Exemplo n.º 31
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0

        if self.learning and not testing:
            # self.epsilon = self.epsilon - 0.05
            # self.epsilon = 1.0/(self.t)
            # self.t = self.t + 1.0
            # system("echo "+str(self.epsilon)+" >> eps")
            # self.epsilon = math.cos(self.t)
            #self.epsilon = math.exp(-0.1 * self.t)
            self.epsilon = abs(math.cos(0.01 * self.t))
            self.t = self.t + 1.0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        #if waypoint == 'right':
        #    state = (waypoint, inputs['left'], inputs['oncoming'])
        #if waypoint == 'forward':
        #    state = (waypoint, inputs['light'], inputs['left'])
        #if waypoint == 'left':
        #    state = (waypoint, inputs['light'], inputs['oncoming'])
        state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = ['']
        max_val = -0xFFFFFF  # a very low starting value, below any realistic Q-value
        print "\nChoose Max Q"

        for action in self.Q[state].keys():
            print "Max", max_val,
            print "Action", action,

            if self.Q[state][action] > max_val:
                print "Q Action Val", self.Q[state][action],
                max_val = self.Q[state][action]
                print "Max", max_val,
                maxQ[0] = action
                print "maxQ", maxQ,
            print
        print "maxQ", maxQ
        print

        for action in self.Q[state].keys():
            print "Action", action,
            if self.Q[state][action] == max_val and action != maxQ[0]:
                print "Q Action Val", self.Q[state][action],
                maxQ.append(action)
                print "maxQ", maxQ,
            print
        print
        print "maxQ", maxQ
        #s = raw_input("MaxQEh")
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            if state not in self.Q.keys():
                self.Q[state] = {}
                for action in self.valid_actions:
                    self.Q[state][action] = 0

                print "State", state
                print "QDict", self.Q

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".

        if not self.learning:
            print "Not Learning",
            action = self.valid_actions[random.randint(
                0,
                len(self.valid_actions) - 1)]

        if self.learning:
            print "Learning",
            print "Epsilon", self.epsilon,
            action_probability = random.uniform(0, 1)
            print "Action Probability", action_probability,
            if action_probability <= self.epsilon:
                action = self.valid_actions[random.randint(
                    0,
                    len(self.valid_actions) - 1)]
                print "random action", action,

            else:
                print "Finding maxQ", self.Q[state],
                actions = self.get_maxQ(state)
                print "Actions", actions,
                if len(actions) == 1:
                    action = actions[0]
                else:
                    action = actions[random.randint(0, len(actions) - 1)]
                print "Action", action,

        print
        print "Given Env", self.env.sense(self)
        print "Next waypoint", self.next_waypoint
        #s = raw_input()
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (self.Q[state][action] *
                                     (1.0 - self.alpha)) + (reward *
                                                            self.alpha)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
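
Stripped of the debug prints, get_maxQ and the tie-handling in choose_action above reduce to a few lines. A minimal sketch (hypothetical name best_actions; assumes the per-state dictionary self.Q[state] used above):

import random

def best_actions(q_row):
    """All actions whose Q-value ties the maximum for this state."""
    max_val = max(q_row.values())
    return [action for action, value in q_row.items() if value == max_val]

# usage sketch: action = random.choice(best_actions(self.Q[state]))
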
Exemplo n.º 32
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'yellow'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.reward = 0
        self.next_waypoint = None

        # Declaring the variable to calculate the success percentage rate

        self.S = 0

        #Declaring the trial counter variable

        self.var_trial = 0

        #Declaring the variable to track success rate

        self.var_trial_S = {}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

        # Incrementing the success percentage counter before resetting the trial

        try:
           if self.var_trial_S[self.var_trial] == 1:
              self.S = self.S + 1
        except:
           pass

        # resetting the trial

        self.var_trial = self.var_trial + 1


    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

               
        # TODO: Select action according to your policy
        action = None
        action = random.choice(Environment.valid_actions)
        
        '''
        # TODO: Update state
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False

        elif self.next_waypoint == 'straight':
            if inputs['light'] == 'red':
                action_okay = False

        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right':
                action_okay = False

        if not action_okay:
            action = None
        '''

        self.state = inputs
        self.state['next_waypoint'] = self.next_waypoint
        self.state = tuple(sorted(self.state.items()))

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.reward = self.reward + reward
        # TODO: Learn policy based on state, action, reward

        # at the end of each trial we check the reward: reward >= 10 means the trial succeeded, otherwise it failed

        if (reward >= 10):
            self.var_trial_S[self.var_trial] = 1
        else:
            self.var_trial_S[self.var_trial] = 0

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Exemplo n.º 33
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.trial = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon, self.alpha = 0, 0
        else:
            #self.epsilon = self.epsilon - 0.05 #for question 6
            self.trial += 1
            self.epsilon = math.exp(-0.01 * self.trial)
            if self.alpha >= 0.1:
                self.alpha = math.exp(-0.07 * self.trial)
            else:
                pass  # alpha has decayed below 0.1; keep it at its current value

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # Set 'state' as a tuple of relevant data for the agent

        # Originally set state as a tuple, but suspect encoding Q-table key as a tuple of tuples possibly resulted in key name collisions.
        # state = (("lights", inputs['light']),
        #("oncoming", inputs['oncoming']),
        #("left", inputs['left']),
        #("waypoint", self.next_waypoint))

        state = str(waypoint) + "_" + inputs['light'] + "_" + str(
            inputs['left']) + "_" + str(inputs['oncoming'])

        if not self.learning:
            pass  # no Q-table entry is needed when the agent is not learning
        else:
            self.Q[state] = self.Q.get(state, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        # simply take the largest stored value across all actions for this state
        maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning and state not in self.Q:
            self.Q[state] = {
                None: 0.0,
                'left': 0.0,
                'right': 0.0,
                'forward': 0.0
            }
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            coin_toss = random.random()
            if coin_toss <= self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                actions_list = []
                highest_Qvalue = self.get_maxQ(state)
                for act in self.Q[state]:
                    if highest_Qvalue == self.Q[state][act]:
                        actions_list.append(act)
                action = random.choice(actions_list)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        if self.learning:
            self.Q[state][action] = self.alpha * reward + (
                1 - self.alpha) * self.Q[state][action]
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
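
The reset method above decays epsilon as exp(-0.01 * trial) and alpha as exp(-0.07 * trial) with a floor around 0.1. A small sketch of that schedule, handy for checking how many training trials keep exploration meaningfully above zero (hypothetical function; the floor handling approximates the code above):

import math

def decay_schedule(n_trials, eps_rate=0.01, alpha_rate=0.07, alpha_floor=0.1):
    """(trial, epsilon, alpha) tuples under the exponential decay used above."""
    schedule = []
    for trial in range(1, n_trials + 1):
        epsilon = math.exp(-eps_rate * trial)
        alpha = max(alpha_floor, math.exp(-alpha_rate * trial))
        schedule.append((trial, epsilon, alpha))
    return schedule
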
Exemplo n.º 34
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=True, epsilon=0.5, alpha=0.4):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()
        # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.decay_rate = 0.001
        self.a = 0.01
        self.n_trials = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.n_trials += 1

        if testing:
            self.epsilon = 0
            self.alpha = 0
        elif self.learning:
            # self.epsilon = self.epsilon - self.decay_rate
            self.epsilon = np.exp(-(self.a*self.n_trials))

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        input1, input2, input3, input4 = inputs.items()
        state = (waypoint, input1, input2, input3, input4)

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        # from https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
        # NOTE: despite the name, this returns the best action (the dict key), which choose_action then uses directly
        maxQ = max(self.Q[state], key=self.Q[state].get)

        action_list = []

        for action in self.Q[state]:
            if self.Q[state][action] == self.Q[state][maxQ]:
                action_list.append(action)
            
        if len(action_list) > 0:
            maxQ = random.choice(action_list)

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            if state not in self.Q.keys():
                self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action_number = random.randint(0, 3)
            action = self.valid_actions[action_number]
        elif self.learning:
            rand_num = random.random()
            if rand_num < self.epsilon:
                action_number = random.randint(0, 3)
                action = self.valid_actions[action_number]
            else:
                action = self.get_maxQ(state=state)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        #       Set discount factor to zero *from Kevin Robertson

        # Transitions <s,a,r,s'>

        # sought some help from the forums and lecture videos.
        # update Q-value = current Q-value * (1 - alpha) + alpha * (current reward + discount factor * expected future reward)


        self.Q[state][action] = self.Q[state][action] * (1 - self.alpha) + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
Exemplo n.º 35
0
class LearningAgent_v2(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent_v2, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.trip_history = []
        self.debug = True
        self.gamma = 0.2  #upper bound of discount
        #self.alpha = 0.5 #upper bound of learning rate
        self.epsilon = 0.1  #lower bound of proportion of random steps
        self.reg = 0.001  # regularization param for regression
        self.lr = 0.1  # learning rate for regression
        self.clock_update = 0  # store number of updates
        self.init_params_scale = 1e-4  # scale of initial params setting

        self.max_memory = 400  # number of rows for state_action_experience
        self.batch_size = 20  # size of batch
        self.batch_step = 20  # extract a batch for each batch_step steps
        self.param_step = self.max_memory  # how many step should update w

        self.state_feature = [
            'right_no', 'forward_no', 'left_no', 'next_right', 'next_forward',
            'next_left'
        ]
        self.action_feature = ['right', 'forward', 'left']

        self.state = None

        self.num_action_space = np.concatenate(
            (np.diag(np.ones(3)), np.zeros(3)[np.newaxis, :]))

        self.state_action_feature = self.state_feature + self.action_feature + [
            x + "_action_" + y for x in self.state_feature
            for y in self.action_feature
        ]
        #self.state_action_df = pd.DataFrame(columns = (self.state_action_feature + ['Q_score']) )
        self.state_action_experience = np.zeros(
            (1, len(self.state_action_feature)))
        self.Q_score_experience = np.zeros(1)

        self.ex_state = np.zeros(len(self.state_feature))
        self.ex_state_action = np.zeros(len(self.state_action_feature))
        self.ex_reward = 0

        self.params = {
            'b':
            np.random.randn(1),
            'w':
            self.init_params_scale *
            np.random.randn(len(self.state_action_feature), 1)
        }

        self.params_update = self.params

        self.reward_history = np.zeros(1)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.ex_reward = 0
        self.ex_state_action = np.zeros(len(self.state_action_feature))
        self.ex_state = np.zeros(len(self.state_feature))

        if (len(self.trip_history) < 150):
            print 'Current success rate is {}'.format(
                sum(self.trip_history) / (len(self.trip_history) + 0.000001))
        else:
            print 'Success rate for recent 100 trials is {}'.format(
                sum(self.trip_history[-100:]) /
                (len(self.trip_history[-100:]) + 0.000001))
        print 'Average reward for recent moves is {}'.format(
            np.mean(self.reward_history))
        if (self.reward_history.shape[0] > 1000):
            self.reward_history = np.delete(self.reward_history, range(100))

    def numeric_state(self, inputs=None, deadline=0, next_waypoint=None):
        #print 'inputs is {}, deadline is {}, next_waypoint is {}'.format(str(inputs), str(deadline), str(next_waypoint))
        col_name = self.state_feature
        state = np.zeros(len(col_name))
        state += np.array(
            map(lambda x: x == 'next_' + str(next_waypoint), col_name))

        if inputs['light'] == 'red' and inputs['left'] == 'forward':
            #state += np.array( map(lambda x: x=='right_no', col_name) )
            state[0] = 1
        if inputs['light'] == 'red':
            state[1] = 1
        if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward'
                                        or inputs['oncoming'] == 'right'):
            state[2] = 1
        #state[len(col_name)-1] = deadline

        if False:
            print 'inputs is {}, deadline is {}, next_waypoint is {}\n'.format(
                str(inputs), str(deadline), str(next_waypoint))
            print zip(col_name, state)
            raw_input("Press Enter to continue...")

        return state

    def numeric_action(self, action=None):
        col_name = self.action_feature
        return np.array(map(lambda x: x == str(action), col_name))

    def numeric_state_action(self, num_state=None, num_action=None):
        return np.concatenate(
            (num_state, num_action, np.outer(num_state, num_action).flatten()),
            axis=0)

    def max_Q_param(self, num_state):
        X = np.apply_along_axis(
            lambda x: self.numeric_state_action(num_state, x),
            axis=1,
            arr=self.num_action_space)
        score = X.dot(self.params['w']) + self.params['b']

        if False:
            print '\nX are\n {}\n, Params are\n {}\n'.format(
                str(X), str(self.params))
            raw_input("Press Enter to continue...")

        choose = np.argmax(score)
        opt_action = None
        if choose < 3:
            opt_action = self.action_feature[choose]
        num_state_action = X[choose]
        max_Q_hat = score[choose]

        if False:
            print '\nScores are\n {}\n, opt action are\n {}\n'.format(
                str(score), str(opt_action))
            raw_input("Press Enter to continue...")

        return opt_action, max_Q_hat, num_state_action

    def gradient(self, X, y, reg=0.01):
        if False:
            print '\nX are\n {}\n and y are\n {}\n'.format(str(X), str(y))
            raw_input("Press Enter to continue...")

        w, b = self.params_update['w'], self.params_update['b']
        scores = X.dot(w) + b
        y = y.flatten()[:, np.newaxis]
        loss = np.mean((y - scores)**2) + 0.5 * reg * np.sum(w**2)

        if False:
            print '\ny are\n {}\n and scores are\n {}\n and loss is\n {}\n'.format(
                str(y), str(scores), str(loss))
            raw_input("Press Enter to continue...")

        d_w = np.mean(
            (X * ((scores - y) * 2)), axis=0)[:, np.newaxis] + reg * w
        d_b = np.mean((scores - y) * 2)

        return d_w, d_b, loss

    def sample_X_y(self, size=10):
        idx = np.random.randint(self.state_action_experience.shape[0],
                                size=size)
        X = self.state_action_experience[idx, :]
        y = self.Q_score_experience[idx]

        return X, y

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        #may need to take deadline into account?

        # TODO: Update state
        self.state_action_experience = np.concatenate(
            (self.state_action_experience,
             self.ex_state_action[np.newaxis, :]))
        num_state = self.numeric_state(inputs=inputs,
                                       deadline=deadline,
                                       next_waypoint=self.next_waypoint)

        self.state = zip(self.state_feature, num_state)

        # TODO: Select action according to your policy

        action, max_Q_hat, num_state_action = self.max_Q_param(num_state)

        if (random.uniform(0, 1) < self.epsilon):
            action = random.choice(Environment.valid_actions[:])
            num_action = self.numeric_action(action)
            num_state_action = self.numeric_state_action(num_state=num_state,
                                                         num_action=num_action)
            if False:
                print "\n Use a random action, {}".format(str(action))
                #debug
                raw_input("Press Enter to continue...")

        true_Q_score = self.ex_reward + self.gamma * max_Q_hat
        self.Q_score_experience = np.append(self.Q_score_experience,
                                            true_Q_score)
        self.clock_update += 1

        if False:
            print '\nShape of State Action experience Matrix is {}\n'.format(
                self.state_action_experience.shape)
            print '\nShape of Q score experience is {}\n'.format(
                self.Q_score_experience.shape)
            raw_input("Press Enter to continue...")

        # TODO: Learn policy based on state, action, reward
        reward = self.env.act(self, action)

        self.ex_reward = reward
        self.ex_state_action = num_state_action
        self.reward_history = np.append(self.reward_history, reward)

        if reward > 9:
            self.trip_history.append(1)
            #need to write down something here
            self.state_action_experience = np.concatenate(
                (self.state_action_experience,
                 self.ex_state_action[np.newaxis, :]))
            self.Q_score_experience = np.append(self.Q_score_experience,
                                                reward)
            self.clock_update += 1
        elif deadline == 0:
            self.trip_history.append(0)
            self.state_action_experience = np.concatenate(
                (self.state_action_experience,
                 self.ex_state_action[np.newaxis, :]))
            self.Q_score_experience = np.append(self.Q_score_experience,
                                                reward)
            self.clock_update += 1

        if (self.clock_update > self.max_memory + 2):
            self.state_action_experience = np.delete(
                self.state_action_experience,
                range(self.state_action_experience.shape[0] - self.max_memory),
                0)
            self.Q_score_experience = np.delete(
                self.Q_score_experience,
                range(len(self.Q_score_experience) - self.max_memory))

            if False:
                print '\nShape of State Action experience Matrix is {}\n'.format(
                    self.state_action_experience.shape)
                print '\nShape of Q score experience is {}\n'.format(
                    self.Q_score_experience.shape)
                raw_input("Press Enter to continue...")

            if (self.clock_update % self.batch_step == 0):
                for i in xrange(2):
                    if False:
                        print '\nUpdated Parameters are {}\n'.format(
                            str(self.params_update))
                        raw_input("Press Enter to continue...")
                    data_X, data_y = self.sample_X_y(size=self.batch_size)
                    d_w, d_b, loss = self.gradient(data_X,
                                                   data_y,
                                                   reg=self.reg)
                    if False:
                        print '\nGradients are {} and {}\n'.format(
                            str(d_w), str(d_b))
                        raw_input("Press Enter to continue...")

                    if False:
                        print '\nloss is {}\n'.format(loss)
                        raw_input("Press Enter to continue...")

                    self.params_update[
                        'w'] = self.params_update['w'] - self.lr * d_w
                    self.params_update[
                        'b'] = self.params_update['b'] - self.lr * d_b

            if self.clock_update % self.param_step == 0:
                self.params = self.params_update
                if True:
                    print '\nBias for regression is {}\n'.format(
                        str(self.params['b']))
                    weight_df = pd.DataFrame(data=self.params['w'].T,
                                             columns=self.state_action_feature)
                    print '\nWeights for regression is\n{}\n'.format(
                        weight_df.T)
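
The example above approximates Q(s, a) with a linear model w·x + b trained by minibatch gradient descent on squared error plus L2 regularization (the gradient method above). A compact numpy sketch of one such step under the same loss (hypothetical names; not the author's exact code):

import numpy as np

def linear_q_sgd_step(w, b, X, y, lr=0.1, reg=0.001):
    """One gradient step minimizing mean((y - Xw - b)^2) + 0.5*reg*||w||^2."""
    scores = X.dot(w) + b              # predicted Q-values, shape (n, 1)
    err = scores - y.reshape(-1, 1)    # residuals
    d_w = (2.0 * X * err).mean(axis=0).reshape(-1, 1) + reg * w
    d_b = (2.0 * err).mean()
    return w - lr * d_w, b - lr * d_b
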
Exemplo n.º 36
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self,
                 env,
                 discount_rate=0.1,
                 epsilon=0.15,
                 epsilon_decay=.99):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.actions = (None, 'forward', 'left', 'right')
        self.Q = {}  #hashable Q-Table
        self.default_Q = 1  #initially tried zero
        self.epsilon = epsilon  #controls frequency of random actions [note: not used when implementing epsilon decay]
        self.epsilon_decay = epsilon_decay  #decay rate (multiplier) for GLIE
        self.alpha = 1  # learning rate [note: not used when updating/decaying alpha]
        self.initial_alpha = 1  #starting point for alpha-updating model
        self.discount_rate = discount_rate  #controls extent to which we value future rewards

        # keep track of net reward, trial no., penalties
        self.net_reward = 0
        self.trial = 0

        self.success = 0
        self.net_penalty = 0
        self.penalty_tracker = []
        self.reward_tracker = []
        self.success_tracker = []
        self.trial_tracker = []

        #initialize Q-table with default values
        for light_state in ['green', 'red']:
            for oncoming_traffic_state in self.actions:
                for right_traffic_state in self.actions:
                    for left_traffic_state in self.actions:
                        for waypoint_state in self.actions[
                                1:]:  #ignore NONE - this is not a waypoint possibility
                            # record each state
                            state = (light_state, oncoming_traffic_state,
                                     right_traffic_state, left_traffic_state,
                                     waypoint_state)
                            self.Q[state] = {}
                            for action in self.actions:
                                self.Q[state][action] = self.default_Q

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.trial += 1
        self.net_reward = 0
        self.net_penalty = 0

    #for the random agent
    def get_random_action(self, state):
        return random.choice(self.actions)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'],
                      inputs['right'], self.next_waypoint)

        # TODO: Select action according to your policy

        #random agent approach
        #best_action = self.get_random_action(self.state)

        #Q-learning approach
        if random.random() < self.epsilon:  # add randomness to the choice
            best_action = random.choice(self.actions)

        else:
            best_action = None
            max_Q = None
            #find action with best Q value
            for available_action in self.actions:
                if self.Q[self.state][available_action] > max_Q:
                    best_action = available_action
                    max_Q = self.Q[self.state][available_action]

        #EPSILON DECAY. COMMENT OUT FOR INITIAL PARAMETER TUNING
        self.epsilon = 1 / (self.trial + self.epsilon_decay)

        # Execute action and get reward
        reward = self.env.act(self, best_action)
        self.net_reward += reward

        #successful if reward is large. count this and also store other stats from this trial.
        if reward > 8:
            self.success += 1

        if reward < 0:
            self.net_penalty += reward

        #if done, update stats
        if self.env.done == True or deadline == 0:
            self.penalty_tracker.append(self.net_penalty)
            self.reward_tracker.append(self.net_reward)
            if deadline == 0:
                self.success_tracker.append(0)
            else:
                self.success_tracker.append(1)
            self.trial_tracker.append(
                40 - deadline)  #number of turns it took to reach the end

        #if all trials done, show progression of performance indicators over time
        if self.trial == 100:
            print "net penalties over time: " + str(self.penalty_tracker)
            print "net rewards over time: " + str(self.reward_tracker)
            print "success pattern: " + str(self.success_tracker)
            print "trials required per round: " + str(self.trial_tracker)

        # TODO: Learn policy based on state, action, reward

        #get next state
        next_state = self.env.sense(self)
        next_waypoint = self.planner.next_waypoint()

        #re-represent this as a tuple since that's how the Q dict's keys are formatted
        next_state = (next_state['light'], next_state['oncoming'],
                      next_state['left'], next_state['right'], next_waypoint)
        #add state to q dict if not already there
        if next_state not in self.Q:
            self.Q[next_state] = {}
            for next_action in self.actions:
                self.Q[next_state][next_action] = self.default_Q

        utility_of_next_state = None
        #search through available actions, find one with higest Q
        for next_action in self.Q[next_state]:
            if self.Q[next_state][next_action] > utility_of_next_state:
                utility_of_next_state = self.Q[next_state][next_action]

        #next get utility of state
        utility_of_state = reward + self.discount_rate * utility_of_next_state

        # update Q Table
        self.Q[self.state][best_action] = (1 - self.alpha) * \
            self.Q[self.state][best_action] + self.alpha * utility_of_state

        #alpha update. comment this out when testing other inputs in isolation.
        #decay needed to ensure Q convergence. source: http://dreuarchive.cra.org/2001/manfredi/weeklyJournal/pricebot/node10.html
        if self.trial != 0:
            #self.alpha = 1 / (self.trial + self.epsilon_decay)
            num_trials = 100
            self.alpha = (self.initial_alpha * (num_trials / 10.0)) / (
                (num_trials / 10.0) + self.trial)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, successes = {}".format(
            deadline, inputs, best_action, reward, self.success)  # [debug]
Exemplo n.º 37
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.A = ['forward', 'left', 'right', None]  # all available actions
        self.trial = 0 # the number of trials
        self.Q = {} # Init Q table(light, next_waypoint)
        self.init_value = 1.5  # the value to initialize the Q table with
        for i in ['green', 'red']:  # possible lights
            for j in ['forward', 'left', 'right']:  ## possible next_waypoints
                self.Q[(i,j)] = [self.init_value] * len(self.A)  ## linearized Q table
        self.gamma = 0.3  # discount factor
        self.alpha = 0.5  # learning rate
        self.epsilon = 0.1  # prob to act randomly; a higher value means more exploration (more random actions)
        self.reward_holder = ''  # string to hold negative rewards and a separator term
        self.breaker = 0  # value to detect when a new trial occurs

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], self.next_waypoint)
        
        # TODO: Select action according to your policy
        #action = random.choice(self.env.valid_actions) # random action
        
        # Find the index of the action with the max Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        # assign action 
        p = random.random() #generate random value between 0 and 1
        # simulated annealing approach to avoid local optima, mainly to avoid staying in the same place
        if p <= self.epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        '''
        # updating learning rate
        alpha_tune = 500  # tuning parameter
        alpha = 1/(1.1+self.trial/alpha_tune)  # decay learning rate
        self.trial = self.trial+1
        '''
        ## get the next state,action Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_next_waypoint)

        ## update Q table
        self.Q[self.state][self.A.index(action)] = \
            (1-self.alpha)*self.Q[self.state][self.A.index(action)] + \
            (self.alpha * (reward + self.gamma * max(self.Q[next_state])))
       
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
	
        # to get an idea of penalties in later trials
        if deadline >= self.breaker:
            self.reward_holder += '3 '  # separator token; splitting on it later leaves only the negative rewards for a given trial
        self.breaker = deadline
        if reward < 0:
            self.reward_holder += str(reward) + ' '
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

        # Initialize Q
        # One dimension for state, another dimension for actions
        # By doing so, you can just map the corresponding state to choose action in the future
        self.Q = {}
        for i in ['green', 'red']:
            for j in [None, 'forward', 'left', 'right']:
                for k in self.env.valid_actions:
                    self.Q[(i, j, k)] = [3] * len(self.env.valid_actions)

        # Initialize basic parameters of the Q-learning equation
        # Learning rate
        self.alpha = 0.3
        # Discount rate
        self.gamma = 0.3
        # Simulated annealing: epsilon is the percent chance of a random action
        self.epsilon = 10
        # Trials for plotting
        self.trials = -1
        self.max_trials = 20
        self.x_trials = range(0, self.max_trials)
        self.y_trials = range(0, self.max_trials)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.trials = self.trials + 1

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # These are the valid inputs from valid_inputs in the class Environment in environment.py
        # 'light', 'oncoming', 'left', 'right'
        # Using 'light', 'oncoming', and next_waypoint, we build the state tuple here
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)

        # TODO: Select action according to your policy

        # 1. We can find the action corresponding to the maximum value of Q for the current state
        # self.Q[self.state] accesses the current state
        # index(max(self.Q[self.state])) returns the index of the largest Q-value for that state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        # 2. Then we select the action to maximize the value of Q

        # "Simulated annealing"
        # to allow the agent to still explore the environment
        # and not just still the the most conservative approach
        # It is a probabilistic technique for approximating the global optimum of a given function

        # This implies epsilon (probability of random action) is 10%
        # 0 and 1 can be the only values less than epsilon, 2

        #action = random.choice(self.env.valid_actions)

        a = random.randrange(0, 2)

        if a < self.epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.env.valid_actions[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        # The subsequent next of state and action, Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'],
                      next_next_waypoint)

        # Update Q
        Q_old = self.Q[self.state][self.env.valid_actions.index(action)]
        Q_next_utility = reward + self.gamma * max(self.Q[next_state])
        self.Q[self.state][self.env.valid_actions.index(action)] = \
            (1 - self.alpha) * Q_old + (self.alpha * Q_next_utility)

        # Determine if trial is successful (1) or not (0)
        if (deadline == 0) & (reward < 10):
            self.y_trials[self.trials] = 0
        else:
            self.y_trials[self.trials] = 1
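
On the exploration step above: if the intent is a 10% chance of a random action, one simpler sketch is to compare a uniform draw directly against the probability. This is only an illustration with made-up Q-values, assuming the same list-style Q-row as the class above:

import random

def epsilon_greedy(Q_row, valid_actions, epsilon=0.10):
    if random.random() < epsilon:
        return random.choice(valid_actions)          # explore with probability epsilon
    return valid_actions[Q_row.index(max(Q_row))]    # otherwise exploit the highest Q-value

print(epsilon_greedy([3, 5, 1, 0], [None, 'forward', 'left', 'right']))  # usually 'forward'
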
Exemplo n.º 39
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)# sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'black'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        
        # TODO: Initialize any additional variables here
        self.state = None
        self.action = None
        self.alpha = 0.50
        self.gamma = 0.05
        self.epsilon = .01
        self.q_table = {}
        self.actions = [None, 'forward', 'left', 'right']
        self.trips = 0 
        softmax_probabilities = {} 
        self.previous_state = None
        self.last_action = None 
        self.last_reward = None
        
    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        # 
        self.trips += 1
        if self.trips >= 10:
            self.epsilon = .10
            if self.trips >= 20:
                self.epsilon = 0.00


    def get_action(self, state, softmax_probabilities):
        # limited randomness in action choice, governed by self.epsilon -- helps avoid local optima
        if random.random() < self.epsilon:
            action = np.random.choice(self.actions, p=softmax_probabilities)
        else:
            action = self.actions[np.argmax(softmax_probabilities)]
        return action

    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
    
        #TODO: Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'],inputs['left'], self.next_waypoint)
        self.state = state
        
        # Initialization of q_table values to .75
        for action in self.actions:
            if(state,action) not in self.q_table:
                self.q_table[(state,action)] = .75
            
        # Apply a softmax to the Q-values in q_table so they become 0-1 probabilities
        softmax_probabilities = [self.q_table[(state, None)], self.q_table[(state, 'forward')],
                                 self.q_table[(state, 'left')], self.q_table[(state, 'right')]]
        # softmax = (e ^ x) / sum(e ^ x)
        softmax_probabilities = np.exp(softmax_probabilities) / np.sum(np.exp(softmax_probabilities), axis=0)
        # Choose the action, then execute it to get the reward
        action = self.get_action(self.state, softmax_probabilities)
        # Get a reward for the action
        reward = self.env.act(self, action)
        

# TODO: Learn policy based on state, action, reward  
        if self.previous_state is None:
            self.q_table[(state, action)] = ((1-self.alpha) * reward) + (self.alpha * self.q_table[(state, action)])
        else: 
            current_q = self.q_table[(self.previous_state, self.last_action)]
            future_q = self.q_table[(state,action)]
            current_q = (1 - self.alpha) * current_q + self.alpha * (self.last_reward + self.gamma * future_q)
            self.q_table[(self.previous_state, self.last_action)] = current_q
        
        # use previous_state to store the last state so that I can lag between current state and next 
        self.previous_state = self.state
        self.last_action = action 
        self.last_reward = reward
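
The softmax used in update() above can be checked in isolation. This is only a sketch with made-up Q-scores, assuming numpy is available as in the agent:

import numpy as np

q_values = [0.75, 1.2, -0.3, 0.5]   # made-up Q-scores for [None, 'forward', 'left', 'right']
probs = np.exp(q_values) / np.sum(np.exp(q_values))
print(probs)        # non-negative weights
print(probs.sum())  # sums to 1.0, so it can be passed to np.random.choice(..., p=probs)
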
Exemplo n.º 40
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        #self.epsilon = self.epsilon -0.05

        self.a = 0.01
        self.t = self.t + 1

        if testing:
            self.epsilon, self.alpha = 0, 0

        else:
            #self.epsilon = (self.a)**self.t
            #self.epsilon =1.0/(self.t**2)
            self.epsilon = math.exp(-(self.a * self.t))

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])

        if self.learning:
            self.createQ(state)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        ## My example for easy understanding
        #Q = {'left':10.0, 'right':11.0, 'forward':-5.0, None :0.0 }
        #max(Q)---> right

        #row = self.Q[state]
        #print row
        #maxkey = max(row)
        #maxQ = self.Q[state][maxkey]
        #maxQ = max(self.Q[state][action] for action in self.valid_actions)
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        ## My example for easy understanding
        #a = {'One':1, 'Two':2}
        #keyword = 'Two'
        #print keyword in a

        #if self.learning:
        #    if state not in self.Q:
        #        self.Q[state] = {'left':0.0, 'right':0.0, 'forward':0.0, None :0.0 }

        if (self.learning) and (state not in self.Q):
            self.Q[state] = {action: 0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        #action = None

        ###########
        ## TO DO ##
        ###########

        ## My example for easy understanding
        #import random
        #foo = ['a', 'b', 'c', 'd', 'e']
        #print(random.choice(foo)

        if self.learning:
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                #action = max(self.Q[state])
                #action = self.get_maxQ(state)
                maxQ = self.get_maxQ(state)
                action = random.choice([
                    action for action in self.valid_actions
                    if self.Q[state][action] == maxQ
                ])
        else:
            action = random.choice(self.valid_actions)

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        #next_waypoint = self.planner.next_waypoint()
        #next_inputs = self.env.sense(self)
        #next_light = next_inputs['light']
        #next_left = next_inputs['left']
        #next_right = next_inputs['right']
        #next_oncoming = next_inputs['oncoming']
        #next_state = (next_waypoint, next_light, next_left, next_right, next_oncoming)

        self.Q[state][action] = (
            1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        #self.Q[state][action] = (1-self.alpha)*self.Q[state][action] + self.alpha*(reward+self.get_maxQ(next_state))
        #self.Q[state][action] = (1-self.alpha)*self.Q[state][action] + self.alpha*(reward + self.get_maxQ(next_state)- self.Q[state][action])
        #self.Q[state][action] = reward*self.alpha + (1-self.alpha)*self.Q[state][action]

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
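
On the decay used in reset() above: epsilon = exp(-a*t) with a = 0.01 falls off slowly, so several hundred training trials pass before exploitation dominates. A small sketch of the schedule, with illustrative trial numbers:

import math

a = 0.01
for t in (1, 10, 100, 300, 500):
    print((t, round(math.exp(-a * t), 3)))   # e.g. t=100 -> ~0.368, t=500 -> ~0.007
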
Exemplo n.º 41
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        self.t = 0
        self.a = 0.000001
        # self.a = 0.00001, alpha = 0.2, epsilon = 1-a*t**2 --> A+ Safety/A+ Reliability, suboptimal grid
        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        self.t = self.t + 1
        self.epsilon = 1 - self.a * self.t**2
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        #state = None
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = -100000000
        for each in self.Q[state]:
            val = self.Q[state][each]
            if val > maxQ:
                maxQ = val

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        # Note: strState below is built but never used; the tuple 'state' itself serves as the Q-table key
        temp = ['None' if x is None else str(x) for x in state]
        strState = '_'.join(temp)
        if self.learning:
            temp = {}
            for action in self.valid_actions:
                temp[action] = 0.0
            self.Q[state] = self.Q.get(state, temp)

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            p = random.uniform(0, 1)
            if p < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxVal = -100000000
                for each in self.Q[state]:
                    val = self.Q[state][each]
                    if val > maxVal:
                        action = each
                        maxVal = val
                    elif val == maxVal:
                        print action, each
                        action = random.choice([action, each])

        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            # Q-value iteration update rule
            # (1-alpha)*Q(st,at) + alpha (rt+1 + gamma*maxQ(st+1,a))
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + self.alpha * (reward)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
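
As a compact alternative to the explicit loop in choose_action() above, ties at the maximum Q-value can be gathered with a comprehension before a random pick. A minimal sketch assuming a dict-valued Q-row with made-up numbers:

import random

q_row = {None: 0.0, 'forward': 2.0, 'left': 2.0, 'right': -1.0}
max_q = max(q_row.values())
best = [a for a, q in q_row.items() if q == max_q]
print(random.choice(best))   # 'forward' or 'left', chosen uniformly among the tie
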
Exemplo n.º 42
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        random.seed(1000)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # commented out testing parameters
            # self.epsilon = self.epsilon - 0.05
            self.t += 1.0
            # self.epsilon = 1.0/(self.t**2)
            # self.epsilon = 1.0/(self.t**2 + self.alpha*self.t)
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            self.epsilon = math.fabs(math.cos(self.alpha * self.t))

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        # helper to create state string
        def xstr(s):
            if s is None:
                return 'None'
            else:
                return str(s)

        state = xstr(waypoint) + "_" + inputs['light'] + "_" + xstr(
            inputs['left']) + "_" + xstr(inputs['oncoming'])
        if self.learning:
            self.Q[state] = self.Q.get(state, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })
        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        # preset an initialization value that should be replaced by a more valid Q value in the loop.
        maxQ = -1000.0
        for action in self.Q[state]:
            if maxQ < self.Q[state][action]:
                maxQ = self.Q[state][action]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            self.Q[state] = self.Q.get(state, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > 0.01 and self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                valid_actions = []
                maxQ = self.get_maxQ(state)
                for act in self.Q[state]:
                    if maxQ == self.Q[state][act]:
                        valid_actions.append(act)
                action = random.choice(valid_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] + self.alpha * (
                reward - self.Q[state][action])

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
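
One property of the decay in reset() above: epsilon = |cos(alpha*t)| oscillates rather than decaying monotonically, so exploration periodically returns even late in training. A small sketch of the first few values, assuming alpha = 0.5 as in the constructor:

import math

alpha = 0.5
for t in range(1, 11):
    print((t, round(math.fabs(math.cos(alpha * t)), 3)))
# Drops toward 0 near t = 3 (alpha*t close to pi/2), then rises back toward 1 near t = 6.
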
Exemplo n.º 43
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'black'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        self.valid_actions = Environment.valid_actions

        # Intializing previous action, state, reward.
        self.prev_action = None
        self.prev_state = None
        self.prev_reward = None

        #initialize the Q_table
        self.Q = {}

        #Parameters for Q-Learning
        self.alpha = 0.85
        self.gamma = 0.45
        self.epsilon = 0.001
        self.default_Q = 10

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None
        self.prev_action = None
        self.prev_state = None
        self.prev_reward = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        #Update inputs to include all relevant variables (next_waypoint, status, light, left)
        inputs['waypoint'] = self.next_waypoint
        del inputs['right']

        #Convert dictionary values of inputs into tuple to reflect state
        self.state = inputs
        state = tuple(inputs.values())

        # TODO: Select action according to your policy
        best_action = None

        #Exploration
        if random.random() <= self.epsilon:
            best_action = random.choice(self.valid_actions)
            if (state, best_action) not in self.Q.keys():
                self.Q[(state, best_action)] = self.default_Q
            Q_value = self.Q[(state, best_action)]

        else:
            Q_values = []
            for action in self.valid_actions:
                if (state, action) not in self.Q.keys():
                    self.Q[(state, action)] = self.default_Q
                Q_values.append(self.Q[(state, action)])

            #Find best_action and Q_value
            max_Q = max(Q_values)
            index = []
            for i, x in enumerate(Q_values):
                if x == max(Q_values):
                    index.append(i)
            i = random.choice(index)
            best_action = self.valid_actions[i]
            Q_value = Q_values[i]

        # Execute action and get reward
        action = best_action
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if self.prev_state is not None:
            self.Q[(self.prev_state,
                    self.prev_action)] = (1 - self.alpha) * self.Q[
                        (self.prev_state, self.prev_action)] + self.alpha * (
                            self.prev_reward + self.gamma *
                            (self.Q[(state, action)]))

        #Update previous state, action, and reward
        self.prev_action = action
        self.prev_state = state
        self.prev_reward = reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
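
A caveat about the state construction above: tuple(inputs.values()) depends on the dictionary's ordering, which is not guaranteed to be stable across Python versions. A sketch of a more deterministic key, assuming the same fields, is to sort the items first:

inputs = {'light': 'green', 'oncoming': None, 'left': None, 'waypoint': 'forward'}  # made-up reading
state_key = tuple(sorted(inputs.items()))
print(state_key)   # same tuple regardless of the dict's insertion order
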
Exemplo n.º 44
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=True, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        global t  # module-level trial counter, assumed to be initialized (e.g. t = 0) at module scope
        t = t + 1
        #        self.epsilon = self.epsilon - 0.05
        self.epsilon = math.exp(-0.01 * t)
        #        self.alpha =.5* math.exp(-0.01* t)

        if testing == True:
            self.epsilon = 0
            self.alpha = 0
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus
        # learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########

        if self.learning == True:
            if state not in self.Q.keys():
                self.Q[state] = {el: 0 for el in self.valid_actions}
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        max_value = self.get_maxQ(state)
        best_actions = [
            key for key, value in self.Q[state].items() if value == max_value
        ]
        if self.learning == False:
            action = random.choice(self.valid_actions)
        elif self.epsilon > random.random():
            action = random.choice(self.valid_actions)
        else:
            action = random.choice(best_actions)

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        if self.learning == True:
            self.Q[state][action] = self.alpha * reward + \
            (1-self.alpha) * self.Q[state][action]
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
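
A note on the learn() rule above: the blend alpha*reward + (1-alpha)*Q is algebraically identical to the incremental form Q + alpha*(reward - Q) used in some of the other examples. A two-line check with made-up numbers:

alpha, q, reward = 0.5, 1.2, 2.0
blended = alpha * reward + (1 - alpha) * q
incremental = q + alpha * (reward - q)
print((blended, incremental))   # both 1.6
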
Exemplo n.º 45
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = 0.5
        self.gamma = 0.5
        self.epsilon = 1.0
        self.floor = 0.05
        self.log = True

        self.qinit = {'wait': .0, 'forward': .0, 'left': .0, 'right': .0}
        self.qdf = pd.DataFrame()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.env.totalreward = 0.0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # Filter state space
        # Can ignore traffic when red light except for potential collision on right turn on red
        # Reason: Crossing red and accident have same penalty
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                inputs['oncoming'] = None
            if inputs['left'] != 'forward':
                inputs['left'] = None
            inputs['right'] = None

        if inputs['light'] == 'green':
            # ignore oncoming right turn
            if inputs['oncoming'] == 'right':
                inputs['oncoming'] = None
            # ignore right turn from left
            if inputs['left'] == 'right':
                inputs['left'] = None

        # Select State Space
        #state = ':'.join((inputs['light'], self.next_waypoint))
        state = ':'.join(
            (inputs['light'], str(inputs['oncoming']), str(inputs['left']),
             str(inputs['right']), self.next_waypoint))

        self.state = state
        if self.log: print "[DEBUG]: %s" % state

        if state not in self.qdf.index:
            newq = pd.DataFrame(self.qinit, index=[state])
            self.qdf = self.qdf.append(newq)

        # TODO: Select action according to your policy

        # select best action from Q matrix
        qaction = self.qdf.loc[state].idxmax()

        # random selection of action
        randaction = random.choice(self.env.valid_actions)

        # Action Policy
        sample = random.random()
        if self.log: print "[DEBUG] epsilon: %s" % self.epsilon
        if sample <= self.epsilon:
            action = randaction
            if self.log: print "[DEBUG] randomaction: %s" % action
        else:
            if qaction == 'wait':
                action = None
            else:
                action = qaction
            if self.log: print "[DEBUG] bestaction: %s" % action

        # Slowly decrease epsilon, leave at 5% floor
        if self.epsilon > self.floor:
            self.epsilon -= .00075

        # Override selection. DEBUG only !
        #action = qaction
        #action = randaction

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        # Look up the current Q-value for this state/action (the 'wait' column stands in for None)
        if action is None:
            nowstate = self.qdf['wait'][state]
        else:
            nowstate = self.qdf[action][state]

        # Blended update; note that the future return is approximated by a constant (2.0)
        # rather than by the max Q-value of the next state
        nextstate = nowstate * (1 - self.alpha) + self.alpha * (
            reward + self.gamma * 2.0)
        if self.log: print "[DEBUG] nextstate: %s" % nextstate

        # Update Q matrix
        if action is None:
            self.qdf['wait'][state] = nextstate
        else:
            self.qdf[action][state] = nextstate

        if self.log: print "[DEBUG] qdf:"
        if self.log: print self.qdf

        self.env.totalreward += reward
        if self.log: print "[DEBUG] totalreward: %s" % self.env.totalreward
        if self.log:
            print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
                deadline, inputs, action, reward)  # [debug]
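
The epsilon schedule above is a linear decay with a floor: each update subtracts a fixed step but never drops epsilon below 5%, so some exploration always remains. A standalone sketch of that schedule, using the same constants:

epsilon, floor, step = 1.0, 0.05, 0.00075
for _ in range(2000):
    if epsilon > floor:
        epsilon -= step
print(round(epsilon, 5))   # settles at roughly the 0.05 floor after ~1270 updates
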
Exemplo n.º 46
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, trials=1):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.learning_rate = 0.9
        self.Q = {}

        # Standard default Q value of 0
        # self.default_Q = 0

        # Optimal default Q value.
        self.default_Q = 1

        # discount factor is denoted by Beta in official Bellman equation formula
        # (http://web.stanford.edu/~pkurlat/teaching/5%20-%20The%20Bellman%20Equation.pdf),
        # or gamma in Udacity course.
        self.discount_factor = 0.33
        
        # How likely are we to pick random action / explore new paths?
        self.epsilon = 0.1 

        # How many times agent able to reach the target for given trials?
        self.success = 0
        self.total = 0
        self.trials = trials

        # How many penalties does an agent get?
        self.penalties = 0
        self.moves = 0

        self.net_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.prev_state = None
        self.prev_action = None
        self.prev_reward = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        inputs['waypoint'] = self.next_waypoint

        # Tried deleting some inputs to see how it affected performance.
        # Success rate improved significantly and, counterintuitively, penalties went down as well.
        del inputs['oncoming']
        del inputs['left']
        del inputs['right']

        # Tried to see if deadline info would improve performance; it turned out to make things worse.
        # inputs['deadline'] = deadline

        self.state = tuple(sorted(inputs.items()))

        # TODO: Select action according to your policy

        # Random action 
        # action = random.choice(Environment.valid_actions)

        # Q learning chosen action
        _Q, action = self._select_Q_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Some stats
        self.net_reward += reward
        self.moves += 1
        if reward < 0:
            self.penalties+= 1

        add_total = False
        if deadline == 0:
            add_total = True
        if reward > 5:
            self.success += 1
            add_total = True
        if add_total:
            self.total += 1
            print self._more_stats()

        # TODO: Learn policy based on state, action, reward

        # Note that we update the previous state's Q-value, since the utility estimate always looks one step ahead.
        if self.prev_state is not None:
            if (self.prev_state, self.prev_action) not in self.Q:
                self.Q[(self.prev_state, self.prev_action)] = self.default_Q

            # Update to Q matrix as described in this lesson:
            # https://www.udacity.com/course/viewer#!/c-ud728-nd/l-5446820041/m-634899057

            # WRONG! `- self.Q[(self.prev_state, self.prev_action)]` should not be there.
            # self.Q[(self.prev_state,self.prev_action)] = (1 - self.learning_rate) * self.Q[(self.prev_state,self.prev_action)] + \
            # self.learning_rate * (self.prev_reward + self.discount_factor * \
            #     self._select_Q_action(self.state)[0] - self.Q[(self.prev_state, self.prev_action)])

            # Correct method:
            self.Q[(self.prev_state,self.prev_action)] = (1 - self.learning_rate) * self.Q[(self.prev_state,self.prev_action)] + \
            self.learning_rate * (self.prev_reward + self.discount_factor * \
                self._select_Q_action(self.state)[0])


        # pdb.set_trace()
        self.prev_state = self.state
        self.prev_action = action
        self.prev_reward = reward

        self.env.status_text += ' ' + self._more_stats()

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

    def _more_stats(self):
        """Get additional stats"""
        return "success/total = {}/{} of {} trials (net reward: {})\npenalties/moves (penalty rate): {}/{} ({})".format(
                self.success, self.total, self.trials, self.net_reward, self.penalties, self.moves, round(float(self.penalties)/float(self.moves), 2))

    def _select_Q_action(self, state):
        """Select max Q and action based on given state.

        Args:
            state(tuple): Tuple of state e.g. (('heading', 'forward'), ('light', 'green'), ...).

        Returns:
            tuple: max Q value and best action
        """
        best_action = random.choice(Environment.valid_actions)
        if self._random_pick(self.epsilon):
            max_Q = self._get_Q(state, best_action)
        else:
            max_Q = -999999
            for action in Environment.valid_actions:
                Q = self._get_Q(state, action)
                if Q > max_Q:
                    max_Q = Q
                    best_action = action
                elif Q == max_Q:
                    if self._random_pick(0.5):
                        best_action = action
        return (max_Q, best_action)


    def _get_Q(self, state, action):
        """Get Q value given state and action.

        If Q value not found in self.Q, self.default_Q will be returned instead.

        Args:
            state(tuple): Tuple of state e.g. (('heading', 'forward'), ('light', 'green'), ...).
            action(string): None, 'forward', 'left', or 'right'.
        """
        return self.Q.get((state, action), self.default_Q)

    def _random_pick(self, epsilon=0.5):
        """Random pick with epsilon as bias.

        The larger epsilon is, the more likely it is to pick random action.

        Explanation is available at this course:
        https://www.udacity.com/course/viewer#!/c-ud728-nd/l-5446820041/m-634899065 starting at 2:10.

        One may consider this function as: When True, do random action.

        For equal chance, use epsilon = 0.5

        Args:
            epsilon(float): Likelihood of True.
        """
        return random.random() < epsilon
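
The _get_Q pattern above stores Q-values under (state, action) keys and falls back to an optimistic default for unseen pairs, which nudges the agent to try them. A minimal sketch with a made-up state tuple:

Q = {}
default_Q = 1
state = ('green', 'forward')
print(Q.get((state, 'left'), default_Q))   # unseen pair -> optimistic default of 1
Q[(state, 'left')] = 0.2                   # after learning, the stored value is used instead
print(Q.get((state, 'left'), default_Q))   # -> 0.2
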
Exemplo n.º 47
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        # self.epsilon *= 0.995
        self.epsilon = math.cos(float(self.t) / 500)
        self.alpha = 0.5
        self.t += 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        # ex. waypoint = ['forward', 'left', 'right']
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        # ex. inputs = {'light': 'red', 'oncoming': None, 'left': 'right', 'right': 'forward'}
        deadline = self.env.get_deadline(self)  # Remaining deadline
        # ex. deadline =

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            self.Q.setdefault(state,
                              {action: 0.0
                               for action in self.valid_actions})

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        probability = random.random()
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > probability:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                action_maximum = [
                    action for action, Q in self.Q[state].iteritems()
                    if Q == maxQ
                ]
                action = random.choice(action_maximum)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        # use following function to update q value
        # Q(S,A) - (1-alpha)*Q(S,A) + alpha*[R + gamma*maxQ(S',a)]
        # gamma = 0

        # Q_next = self.get_maxQ(state) # tuple (action , qvalue)
        Q_actual = self.Q[state][action]  # qvalue
        self.Q[state][action] = (1 -
                                 self.alpha) * Q_actual + self.alpha * (reward)

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Exemplo n.º 48
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the first step at making a Q-Learning Agent which begins to learn. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = 1.0  # Discount rate

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial_count = 0

    # def getDecay(self, decay):

    # self.epsilon = 1 / math.pow(self.trial_count, 2)
    # self.epsilon = math.exp(1) ** (-self.alpha * self.trial_count)
    # self.epsilon = math.cos(a * self.trial_count)
    # self.epsilon = self.epsilon / math.sqrt(self.trial_count)
    # self.epsilon = 1 / math.pow(self.trial_count, 2)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.trial_count += 1
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        # self.epsilon -= 0.05
        if self.epsilon > 0:
            self.epsilon = math.pow(self.alpha, self.trial_count)
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        # deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'])
        return state

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        #
        if self.learning:
            if state not in self.Q:
                self.Q[state] = dict()
                for action in self.valid_actions:
                    self.Q[state][action] = 0.0
        return

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())
        return maxQ

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        action = None
        if self.learning:
            if random.random() < self.epsilon:
                # choose action with 'epsilon probability'
                action = random.choice(self.valid_actions)
            else:
                action = random.choice([
                    i for i in self.Q[self.state].keys()
                    if self.Q[self.state][i] == self.get_maxQ(self.state)
                ])
                # bestQ = None
                # bestAction = None
                # for action in self.valid_actions:
                #     thisQ = self.Q[state][action]
                #     if thisQ > bestQ:
                #         bestQ = thisQ
                #         bestAction = action
                # action = bestAction

        else:
            # choose random action
            action = random.choice(self.valid_actions)

        return action

        # if self.learning:
        #     if random.random() < self.epsilon:
        #         action = random.choice(self.valid_actions)
        #     else:

        #         action = random.choice([i for i in self.Q[self.state].keys() if self.Q[self.state][i] == self.get_maxQ(self.state)])
        # else:
        #     action = random.choice(self.valid_actions)

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + (reward * self.alpha)
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
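The reset above decays epsilon as alpha raised to the trial count, and the commented-out lines list other schedules that were tried. The following small comparison writes those schedules as pure functions; the constants are illustrative.

import math

# Illustrative epsilon-decay schedules evaluated at trial number t (t >= 1).
def power_decay(t, base=0.5):
    return base ** t                    # e.g. alpha**t, as used in reset() above

def exponential_decay(t, rate=0.01):
    return math.exp(-rate * t)          # e**(-rate*t), decays smoothly

def inverse_square_decay(t):
    return 1.0 / (t ** 2)               # drops off very quickly

samples = [(t, power_decay(t), exponential_decay(t), inverse_square_decay(t))
           for t in (1, 10, 50)]
# At t=50: 0.5**50 is ~0, e**(-0.5) is ~0.61, 1/2500 is 0.0004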
Exemplo n.º 49
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):

        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        self.gamma = 0.9  # Discount factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial_step = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.trial_step += 1
            #Default
            #self.epsilon = self.epsilon - 0.05

            # Gave excellent results
            self.epsilon = 0.95**self.trial_step

            #self.epsilon = 1/(self.trial_step**2)
            #self.epsilon = math.cos(self.trial_step/19.0)
            #print "*************{}*************".format(self.trial_step)
            #self.gamma = math.pow(0.9,self.trial_step)
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        #deadline seems relevant but dramatically increases state space.
        state = (waypoint, inputs['light'], inputs['oncoming'])
        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        """ THIS CODE RETURNS ACTION ASSOCIATED WITH maxQ value
        maxQ_val = 0
 		#Choose action with max Q value
        for act in self.Q[state]:
            print "Action = {}	Q_val = {}".format(act,self.Q[state][act])
            if self.Q[state][act] > maxQ_val:
 				action = act
 				maxQ_val = self.Q[state][act]
 				print "Action with > Q"
 				
        if maxQ_val == 0:
 			action = random.choice(self.valid_actions)
 			print "Random Action because Q all 0" 		
        """
        maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if self.Q.get(state) is None:
                self.Q[state] = {a: 0 for a in self.valid_actions}
                #print "Creating state Actions: "
                #for acts in self.Q[state]:
                #	print "actions = {}	Q vals = {}".format(acts, self.Q[state][acts])
            else:
                pass  #print "State previously visited"

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        #for debugging purposes
        randy = random.random()
        if ((self.learning == False) or (randy < self.epsilon)):
            action = random.choice(self.valid_actions)
            #print "randy < epsilon; {} < {}".format(randy, self.epsilon)
            print "Random Action Epsilon"
        else:
            print "Getting Max Q"
            maxQ = self.get_maxQ(state)
            print "maxQ = {}".format(maxQ)
            # Kind of clever solution but only returns 1 key/action
            #action = self.Q[state].keys()[self.Q[state].values().index(maxQ)]
            maxQ_actions = []
            for action, q_val in self.Q[state].items():
                if q_val == maxQ:
                    maxQ_actions.append(action)

            print "bucha actions = {}".format(maxQ_actions)
            #print "type of thing = {}".format(type(maxQ_actions))
            action = random.choice(maxQ_actions)
            print action

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        #self.get_maxQ(state)
        # This implementation does not use gamma... but if it did
        # self.Q[state][action] += self.alpha(reward + gamma * argmax(R(s',a')) - self.Q[state][action])
        if self.learning:
            self.Q[state][action] += self.alpha * (reward -
                                                   self.Q[state][action])
            print "new Q = {}".format(self.Q[state][action])

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
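The choose_action in the example above collects every action tied for the maximum Q-value and picks among them at random. Here is a compact sketch of that epsilon-greedy selection with tie-breaking, written independently of the Agent class; the names and toy values are illustrative.

import random

# Epsilon-greedy selection with random tie-breaking (illustrative helper).
def epsilon_greedy(q_for_state, epsilon, valid_actions):
    # q_for_state maps each valid action to its current Q-value.
    if random.random() < epsilon:
        return random.choice(valid_actions)            # explore
    max_q = max(q_for_state.values())
    best = [a for a, q in q_for_state.items() if q == max_q]
    return random.choice(best)                         # exploit; ties broken randomly

# 'forward' and 'right' tie here, so exploitation returns either of them.
q_for_state = {None: 0.0, 'forward': 1.2, 'left': -0.4, 'right': 1.2}
action = epsilon_greedy(q_for_state, 0.1, [None, 'forward', 'left', 'right'])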
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}

        # Discount
        # self.gamma = 0.8
        self.gamma = 0.5
        # self.gamma = 0.1
        # self.gamma = 0.5

        self.epsilon = 0.95  # <===
        # self.epsilon = 0.75

        # self.epsilon = 0.5

        # self.epsilon = 0.5
        # self.epsilon = 0.05
        # self.epsilon = 0.25
        # self.epsilon = 0.99
        # self.epsilon = 0.95

        # Learning Rate
        # self.alpha = 1.
        # self.alpha = 0.8  # <===
        self.alpha = 0.5

        # self.alpha = 0.25
        # self.alpha = 0.5

        self.trips_rewards = {}
        self.trips_timers = {}
        self.trips_steps = {}
        self.trips_finished = {}

        self.trip_reward = 0
        self.total_reward = 0
        self.total_finished = 0

        self.current_step = 1
        self.total_steps = 1
        self.trip_number = 0

        # self.max_trips = 20
        self.max_trips = 100

        self.valid_actions = [None, 'forward', 'left', 'right']

        print ''
        print CSI + "7;30;43m" + "Welcome to SmartCab simulation" + CSI + "0m"
        print ''
        print '=================================================='
        print '-------------------- config ----------------------'
        print 'gamma: ' + str(self.gamma)
        print 'epsilon: ' + str(self.epsilon)
        print 'alpha: ' + str(self.alpha)
        print '--------------------------------------------------'
        print ''

        self.timer = time.time()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.trip_number += 1
        self.current_step = 0
        self.trip_reward = 0
        self.trip_begin_time = time.time()

        if self.trip_number > self.max_trips:  # 20
            # Recording results to log.txt
            print 'self.results_log'
            print self.results_log

            # # Output to the log file
            # f = open('log.txt','ab+')
            # f.write(self.results_log)
            # f.write("\n")
            # f.close()

            # sys.exit(0) # exit if reached the self.max_trips from config

        print '[SmartCab] New trip - awesome!'

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = {}
        state['action'] = self.next_waypoint  # desired action
        state['light'] = inputs['light']
        state['oncoming_traffic'] = inputs['oncoming']
        state['traffic_on_the_left'] = inputs['left']
        state = tuple(state.items())
        self.state = state

        print ''
        print '=================================================='
        print '-------------------- stats -----------------------'
        print 'trip_number: ' + str(self.trip_number)
        print 'current_step: ' + str(self.current_step)
        print 'deadline: ' + str(deadline)
        print 'total_steps: ' + str(self.total_steps)
        print 'trip_reward: ' + str(self.trip_reward)
        print 'total_reward: ' + str(self.total_reward)
        print 'total_reward/total_steps: ' + str(
            self.total_reward / self.total_steps)
        print 'timer: ' + str(round(time.time() - self.timer, 2))
        print '-------------------- STATE -----------------------'
        # ap(state)
        print state
        print '--------------------------------------------------'

        # TODO: Select action according to your policy
        # Select action based on previous outcomes

        # Check if this state occurred before
        print '[SmartCab] Check if this state occurred before'
        if state in self.Q:
            print '=> State is found in Q-table'
            print '=> State occurred ' + str(
                self.Q[state]['count']) + ' time(s)'

            print '[SmartCab] Compare the value of the preconfigured parameter epsilon with a random value: '
            random_value = random.random()

            if random_value > self.epsilon:
                print '=> random_value: ' + str(
                    random_value) + ' > epsilon: ' + str(self.epsilon)
                action = random.choice(self.valid_actions)
                print '=> Taking random action: ' + str(action)
            else:
                print '=> random_value: ' + str(
                    random_value) + ' < epsilon: ' + str(self.epsilon)
                print '[SmartCab] Find available_actions with maximum Q^ score:'
                available_actions = [
                    k for k, v in self.Q[state]['actions_Q^'].items()
                    if v == max(self.Q[state]['actions_Q^'].values())
                ]
                print '=> ' + str(available_actions)
                action = random.choice(available_actions)

                print '[SmartCab] Take random action from the list of available_actions with max Q^: ' + str(
                    action)

            # # ======================================================
            # # Alternative model: Take action in the direction of Waypoint if reward > 0 else wait (None)
            # # ------------------------------------------------------
            # # Check previous reward for desired action ( self.next_waypoint ) in similar state ( state )
            # if self.Q[state]['reward'] > 0:
            #     action = self.next_waypoint
            # else:
            #     action = None
            # # ======================================================

        else:
            print '=> State NOT found in Q-table'

            print '[SmartCab] Assigning default values for this state inside Qtable'

            self.Q[state] = {}
            self.Q[state]['count'] = 0
            self.Q[state]['actions_rewards'] = {}
            self.Q[state]['actions_counts'] = {}
            self.Q[state]['actions_Q^'] = {}

            # action = random.choice(self.valid_actions)

            # Alternative:
            # 'None' may be removed to motivate exploring (as was advised by udacity review):
            action = random.choice(['forward', 'left', 'right'])

            # action = self.next_waypoint
            print '=> Taking random action: ' + str(action)

        # # ======================================================
        # # Alternative model: Take random action
        # # ------------------------------------------------------
        # action = random.choice(self.valid_actions)
        # print '=> Taking random action: ' + str(action)
        # # ------------------------------------------------------

        # Count the occurrence of the current state
        self.Q[state]['count'] += 1

        # Execute action and get reward
        print '--------------------------------------------------'
        reward = self.env.act(self, action)
        print '--------------------------------------------------'
        print '=> Reward: ' + str(reward)

        if reward == 12:
            self.trips_finished[self.trip_number] = 1
            self.total_finished += 1
        else:
            self.trips_finished[self.trip_number] = 0

        print ''
        print '[SmartCab] Calculating and recording current results to Qtable'

        # TODO: Learn policy based on state, action, reward
        if action in self.Q[state]['actions_rewards']:
            self.Q[state]['actions_rewards'][action] = reward
            self.Q[state]['actions_counts'][action] += 1

            Qh = self.Q[state]['actions_Q^'][action]
            Qh = Qh + (self.alpha *
                       (reward +
                        (self.gamma *
                         (max(self.Q[state]['actions_Q^'].values()))) - Qh))

            # # # ======================================================
            # # # Alternative models: Q^ = 0.5 ; Q^ = 1 ; Q^ = reward
            # # # ------------------------------------------------------
            # Qh = 0.5
            # Qh = 1
            # Qh = reward
            # # # ------------------------------------------------------

            self.Q[state]['actions_Q^'][action] = Qh

        else:
            self.Q[state]['actions_rewards'][action] = reward
            self.Q[state]['actions_counts'][action] = 1
            # self.Q[state]['actions_Q^'][action] = 0.5
            # self.Q[state]['actions_Q^'][action] = 1
            # self.Q[state]['actions_Q^'][action] = reward
            # self.Q[state]['actions_Q^'][action] = 0.0
            # self.Q[state]['actions_Q^'][action] = 2.5
            # self.Q[state]['actions_Q^'][action] = 2.0
            self.Q[state]['actions_Q^'][action] = 1.5
            # self.Q[state]['actions_Q^'][action] = 1.0
            # self.Q[state]['actions_Q^'][action] = 0.5

        # Reward for driving in the direction of way point
        self.Q[state]['reward'] = reward

        print '--------------------------------------------------'
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]

        self.total_reward += reward
        self.trip_reward += reward
        self.current_step += 1
        self.total_steps += 1

        print '-----------------self.Q[state]--------------------'

        # ap(self.Q[state])
        print self.Q[state]

        print '=================trips_results===================='
        self.trips_steps[self.trip_number] = self.current_step
        print 'Steps: ' + str(self.trips_steps)

        self.trips_rewards[self.trip_number] = self.trip_reward
        print 'Rewards: ' + str(self.trips_rewards)

        self.trips_timers[self.trip_number] = round(
            time.time() - self.trip_begin_time, 2)
        print 'Timers: ' + str(self.trips_timers)

        print 'Finished: ' + str(self.trips_finished)
        # print 'Timers: ' + str(self.trips_timers)

        self.results_log = 'Total steps: ' + str(
            self.total_steps) + '; reward: ' + str(
                self.total_reward) + '; finished: ' + str(
                    self.total_finished) + '; seconds: ' + str(
                        round(time.time() - self.timer, 4))
        print self.results_log
        print '=================================================='
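The update in the example above bootstraps from the maximum Q^ of the same state it is learning about. The textbook one-step Q-learning update instead bootstraps from the state reached after acting; the sketch below shows that standard form under the assumption that the next state has already been sensed, with illustrative function and variable names.

# Standard one-step Q-learning update (illustrative; not the exact code above).
def q_learning_update(Q, state, action, reward, next_state, alpha=0.5, gamma=0.5):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    future = max(Q.get(next_state, {}).values() or [0.0])   # 0.0 if s' is unseen
    old = Q.setdefault(state, {}).setdefault(action, 0.0)
    Q[state][action] = old + alpha * (reward + gamma * future - old)
    return Q[state][action]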
Exemplo n.º 51
0
class LearningAgent_v0(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent_v0, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.gamma = 0.2  #upper bound of discount
        self.alpha = 0.1  # upper bound of learning rate
        self.epsilon = 0.2  #lower bound of proportion of random steps
        #self.state = {'deadline': None, 'forward_ok': True, 'left_ok': True, 'right_ok': True, 'next_waypoint': None }
        self.state = {
            'light': 'green',
            'oncoming': None,
            'left': None,
            'right': None,
            'next_waypoint': None
        }
        self.exreward = 0
        #self.exstate = {'deadline': None, 'forward_ok': True, 'left_ok': True, 'right_ok': True, 'next_waypoint': None }
        self.exstate = {
            'light': 'green',
            'oncoming': None,
            'left': None,
            'right': None,
            'next_waypoint': None
        }
        self.exaction = None
        self.debug = False
        self.trip_history = []

        self.reward_history = np.zeros(1)

        self.Q = OrderedDict()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = {
            'light': 'green',
            'oncoming': None,
            'left': None,
            'right': None,
            'next_waypoint': None
        }
        self.exreward = 0
        self.exstate = {
            'light': 'green',
            'oncoming': None,
            'left': None,
            'right': None,
            'next_waypoint': None
        }
        self.exaction = None

        if (len(self.trip_history) < 150):
            print 'Current success rate is {}'.format(
                sum(self.trip_history) / (len(self.trip_history) + 0.000001))
        else:
            print 'Success rate for recent 100 trials is {}'.format(
                sum(self.trip_history[-100:]) /
                (len(self.trip_history[-100:]) + 0.000001))
        print 'Average reward for recent moves is {}'.format(
            np.mean(self.reward_history))
        if (self.reward_history.shape[0] > 1000):
            self.reward_history = np.delete(self.reward_history, range(100))

        #print "number of parameter is {0}, sum of Qfunction is {1}".format( len(self.Q.keys()), sum(self.Q.values()) )

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        #may need to take deadline into account?

        # TODO: Update state
        #self.state = {'inputs': inputs, 'deadline': deadline, 'next_waypoint':self.next_waypoint}
        #self.state = {'inputs': inputs, 'next_waypoint':self.next_waypoint}

        #epsilon = self.epsilon + (1-self.epsilon)/(t+1)*5
        #gamma = ( 1- 10/(t+10) ) * self.gamma
        #alpha = self.alpha/(t+1.0)
        gamma = self.gamma
        epsilon = self.epsilon
        alpha = self.alpha

        self.state['next_waypoint'] = self.next_waypoint
        #self.state['deadline'] = int(deadline>5) + int(deadline>25)

        for k in inputs.keys():
            self.state[k] = inputs[k]

        # TODO: Select action according to your policy
        #action = random.choice(Environment.valid_actions[1:])

        newkey = str(self.exstate.values()) + ':' + str(self.exaction)

        if (self.debug):
            print "\n New key is {}".format(newkey)
            #debug
            raw_input("Press Enter to continue...")

        tmp_Q = dict([(x, self.Q[x]) for x in self.Q.keys()
                      if str(self.state.values()) in x])

        #print tmp_Q

        if self.debug:
            print "\n Q value for future state is {}".format(str(tmp_Q))
            #debug
            raw_input("Press Enter to continue...")

        action = random.choice(Environment.valid_actions[:])
        tmp_max_Q = 0
        if (len(tmp_Q) == 0):
            tmp_max_Q = 0
            action = random.choice(Environment.valid_actions[:])
        else:
            #tmp_idx = max(tmp_Q)
            tmp_idx = max(tmp_Q.iterkeys(), key=(lambda key: tmp_Q[key]))
            tmp_max_Q = tmp_Q[tmp_idx]
            if (tmp_max_Q > 0 or len(tmp_Q) == 4):
                #print tmp_idx
                tmp_Q_split = tmp_idx.split(':')
                #print tmp_Q_split
                #print tmp_Q_split
                action = tmp_Q_split[1]
                if action == 'None':
                    action = None
            else:
                exist_actions = [x.split(':')[1] for x in tmp_Q.keys()]
                all_actions = ['None', 'forward', 'left', 'right']
                remaining_actions = [
                    x for x in all_actions if not (x in exist_actions)
                ]
                if self.debug:
                    print "Remaining actions are {}".format(
                        str(remaining_actions))
                action = random.choice(remaining_actions)
                tmp_max_Q = 0
                if action == 'None':
                    action = None

        if self.debug:
            print "\n future optimum action is {}".format(str(action))
            #debug
            raw_input("Press Enter to continue...")

        if (random.uniform(0, 1) < epsilon):
            action = random.choice(Environment.valid_actions[:])
            if self.debug:
                print "\n Instead use a random action, {}".format(str(action))
                #debug
                raw_input("Press Enter to continue...")

        #print 'now ' + str(action)
        #random guess have success rate about ~0.20
        #action = random.choice(Environment.valid_actions[:])

        newval = self.exreward + gamma * tmp_max_Q

        if self.debug:
            print "\n current reward is {0}, gamma is {1}, and estimated max future Q is {2}".format(
                self.exreward, gamma, tmp_max_Q)
            #debug
            raw_input("Press Enter to continue...")

        if newkey in self.Q.keys():
            self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
        else:
            self.Q[newkey] = self.alpha * newval

        if self.debug:
            print "updated Q values {}".format(str(self.Q))
            #debug
            raw_input("Press Enter to continue...")

        #print t
        # Execute action and get reward
        reward = self.env.act(self, action)

        if reward > 9:
            self.trip_history.append(1)
            #need to write down something here
            newkey = str(self.state.values()) + ':' + str(action)
            newval = reward  # + deadline
            if newkey in self.Q.keys():
                self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
            else:
                self.Q[newkey] = self.alpha * newval
        elif deadline == 0:
            self.trip_history.append(0)
            newkey = str(self.state.values()) + ':' + str(action)
            newval = reward
            if newkey in self.Q.keys():
                self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
            else:
                self.Q[newkey] = self.alpha * newval

        # TODO: Learn policy based on state, action, reward
        self.exreward = reward
        self.exstate = self.state
        self.exaction = action
        self.reward_history = np.append(self.reward_history, reward)
Exemplo n.º 52
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q_table = {}
        self.gamma = 0.8
        self.alpha = 0.5
        self.epsilon = 0.1
        self.epsilon_decay_rate = 0.0001
        self.valid_actions = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    # build a state by taking parameters from inputs and next_way point
    def return_state(self, inputs):
        return (
            inputs['light'],
            inputs['oncoming'],
            inputs['left'],
            # inputs['right'],
            self.next_waypoint,
            # self.env.get_deadline(self)
        )

    # read Q_value from Q_table based on state and action
    def get_Q_value(self, state, action):
        if (state, action) in self.Q_table:
            return self.Q_table[(state, action)]
        else:
            return 0

    # return the max Q_value based on the state and all possible actions
    def get_Max_Q(self, state):
        max_Q_value = 0
        for action in self.valid_actions:
            if max_Q_value < self.get_Q_value(state, action):
                max_Q_value = self.get_Q_value(state, action)
        return max_Q_value

    # update Q_value in Q_Table
    def update_Q_value(self, state, action, reward):
        exist_Q_value = self.get_Q_value(state, action)
        self.next_waypoint = self.planner.next_waypoint()
        next_state = self.return_state(self.env.sense(self))
        Q_value = exist_Q_value + self.alpha * (
            reward + self.gamma * self.get_Max_Q(next_state) - exist_Q_value)
        self.Q_table[(state, action)] = Q_value

    # define action policy that take the action which result in the max Q_value from the current state
    def action_policy(self, state):
        action_to_take = None
        best_Q_value = 0
        for action in self.valid_actions:
            if self.get_Q_value(state, action) > best_Q_value:
                best_Q_value = self.get_Q_value(state, action)
                action_to_take = action
            elif self.get_Q_value(state, action) == best_Q_value:
                action_to_take = action
        return action_to_take

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # get the state parameters and display them.
        self.state = self.return_state(inputs)

        # TODO: Select action according to your policy

        # action  = random.choice(Environment.valid_actions[1:])

        # select action according to the policy with greedy strategy
        if self.epsilon > random.random():
            action = random.choice(self.valid_actions)
        else:
            action = self.action_policy(self.state)
            self.epsilon -= self.epsilon_decay_rate

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        self.update_Q_value(self.state, action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
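Exemplo 52 keys its Q-table by (state, action) tuples and returns 0 for pairs it has never seen; dict.get expresses that default lookup directly. A brief sketch of the same pattern follows, with an illustrative toy state and values.

# Flat Q-table keyed by (state, action); dict.get supplies the 0.0 default.
Q_table = {}
valid_actions = [None, 'forward', 'left', 'right']

def get_q(state, action):
    return Q_table.get((state, action), 0.0)

def max_q(state):
    return max(get_q(state, a) for a in valid_actions)

toy_state = ('green', None, None, 'forward')
Q_table[(toy_state, 'forward')] = 2.0
assert max_q(toy_state) == 2.0          # unseen actions contribute 0.0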
Exemplo n.º 53
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = defaultdict(list)
        self.alphaLearn = .3
        self.gammaDiscount = .99
        self.stateAction = None
        self.epsilon = .1
        self.penalty = 0
        self.oldPenalty = None

    def reset(self, destination=None):
        self.stateAction = None
        self.planner.route_to(destination)
        print 'penalty = {}'.format(
            self.penalty if self.oldPenalty is None else (self.penalty -
                                                          self.oldPenalty))
        self.oldPenalty = self.penalty
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = [
            inputs['light'], inputs['right'], inputs['left'],
            self.next_waypoint
        ]
        # print 'this is waypoint {}'.format(self.next_waypoint)
        # TODO: Select action according to your policy

        action = None
        maxQ = 0
        if random.random() < self.epsilon:
            action = random.choice([None, 'forward', 'left', 'right'])
        else:
            for k, v in self.Q.items():
                # Match the full 4-feature state; Q keys are 5-tuples (state features + action)
                if list(k[:4]) == self.state and v > maxQ:
                    action = k[4]
                    maxQ = v
            if action is None:
                action = random.choice([None, 'forward', 'left', 'right'])

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.penalty += 1
            # print 'violation', self.penalty

        # TODO: Learn policy based on state, action, reward
        if self.stateAction is not None:
            if self.Q[tuple(self.stateAction)]:
                self.Q[tuple(self.stateAction)] = self.Q[tuple(
                    self.stateAction)] + self.alphaLearn * (
                        reward + self.gammaDiscount * maxQ -
                        self.Q[tuple(self.stateAction)])
            else:
                self.Q[tuple(self.stateAction)] = self.alphaLearn * (
                    reward + self.gammaDiscount * maxQ)
        else:
            print 'Q learning start'

        #print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

        self.stateAction = [
            inputs['light'], inputs['right'], inputs['left'],
            self.next_waypoint, action
        ]
Exemplo n.º 54
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'taxi'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

        # Simple statistical counter variables
        self.trial_count = 0
        self.trial_summary = {}
        for i in range(2):
            self.trial_summary[i] = 0
        self.cycles = 0
        self.time_count_pos = {}
        self.time_count_neg = {}
        self.deadline_start_col = [0] * 100
        self.deadline_end_col = [0] * 100

    def deadline_stats(self, start, deadline):
        if (start):
            self.deadline_start_col.append(deadline)
        elif not start:
            self.deadline_end_col.append(deadline)

    def success_stats(self, suc):
        if (suc):
            self.trial_summary[1] += 1
            self.time_count_pos[self.cycles] += 1
        else:
            self.trial_summary[0] += 1
            self.time_count_neg[self.cycles] += 1

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.trial_count = 0
        self.cycles += 1

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # TODO: Select action according to your policy
        action = random.choice([None, 'forward', 'left', 'right'])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
Exemplo n.º 56
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    # Learning rate
    ALPHA = 0.66
    # Discount factor
    GAMMA = 0.003

    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.state_prev = None
        self.action_prev = None
        self.reward_prev = None
        # Initialize Q-values w random values
        self.Q = {}
        for nwp, l, o, a in product(self.env.valid_actions, ('green', 'red'),
                                    self.env.valid_actions,
                                    self.env.valid_actions):
            self.Q[LAState(nwp, l, o), a] = random.uniform(1, 5)
        # Keep track of how many times the primary agent reached destination
        self.num_reached = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state_prev = None
        self.action_prev = None
        self.reward_prev = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = LAState(next_waypoint=self.next_waypoint,
                        light=inputs['light'],
                        oncoming=inputs['oncoming'])

        # TODO: Select action according to your policy
        # Basic driving agent
        #action = random.choice(self.env.valid_actions)
        # Action with highest Q score for current state
        action = max(self.env.valid_actions, key=lambda a: self.Q[state, a])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if self.state_prev is not None:
            self.Q[self.state_prev, self.action_prev] = \
                (1-self.ALPHA) * self.Q[self.state_prev, self.action_prev] + \
                self.ALPHA * (self.reward_prev + self.GAMMA * self.Q[state, action])

        # Update previous values for the next iteration
        self.state_prev = state
        self.action_prev = action
        self.reward_prev = reward

        # Update number of times that agent reached destination
        self.num_reached += self.env.done

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
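Exemplo 56 seeds every state-action pair with a random value between 1 and 5, so even a purely greedy policy keeps trying actions it has not yet been penalized for. The sketch below illustrates that optimistic-initialization idea using plain tuples in place of the LAState named tuple; the helper itself is illustrative.

import random
from itertools import product

# Optimistic initialization: every state-action pair starts above the typical
# reward, so a greedy policy still explores unvisited actions early on.
def init_optimistic_q(valid_actions, low=1.0, high=5.0):
    Q = {}
    for nwp, light, oncoming, a in product(valid_actions, ('green', 'red'),
                                           valid_actions, valid_actions):
        Q[(nwp, light, oncoming), a] = random.uniform(low, high)
    return Q

valid_actions = [None, 'forward', 'left', 'right']
Q = init_optimistic_q(valid_actions)
state = ('forward', 'green', None)        # (waypoint, light, oncoming)
greedy_action = max(valid_actions, key=lambda a: Q[state, a])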
Exemplo n.º 57
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.epsilon - 0.005
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs["light"], inputs["oncoming"], inputs["left"])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        if state not in self.Q:
            maxQ = None
        else:
            maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None
        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if self.learning == False or random.random() < self.epsilon:
            action = self.valid_actions[random.randint(0, 3)]
        else:
            maxQval = self.get_maxQ(state)
            best_actions = []
            for action in self.valid_actions:
                if self.Q[state][action] == maxQval:
                    best_actions.append(action)
            action = random.choice(best_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + self.alpha * (reward)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Exemplo n.º 58
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.train_no = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        import math
        self.train_no = self.train_no + 1
        # self.epsilon = math.e**(-0.01 * self.train_no)
        # self.alpha = math.e**(-0.001 * (self.train_no))
        # self.epsilon = self.epsilon - 0.05*self.train_no
        # self.alpha = self.alpha - 0.05*self.train_no
        # self.alpha -= 0.005

        # for Question 5, decay function
        self.epsilon -= 0.05

        # for Question 6, use exponential function
        # self.epsilon = 0.7**self.train_no
        # results
        # Trial Aborted!
        # Agent did not reach the destination.

        # for Question 6, use 1/train_no/train_no
        # self.epsilon = 1/self.train_no **2
        # results
        # Trial Aborted!
        # Agent did not reach the destination.

        # for Question 6, use exponential function
        # self.epsilon = math.e**(-0.01 * self.train_no)
        # self.alpha = math.e**(-0.001 * (self.train_no))did not use
        # results
        # Trial Completed!
        # Agent reached the destination.

        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn
        # about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (inputs['light'], inputs['oncoming'], inputs['left'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        # q = [self.Q.get((state,action), 0.0) for action in self.valid_actions]
        q = [self.Q[state][action] for action in self.Q[state]]
        if q:
            maxQ = max(q)
        else:
            maxQ = 0.0
        #maxAction = max(self.Q[state], key=self.Q[state].get)

        #return self.Q[state][maxAction]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            #if self.Q.get(state, None) is None:
            if state not in self.Q:
                self.Q[state] = {}
                for action in self.valid_actions:
                    # self.Q[(state,action)] = 0.0
                    self.Q[state][action] = 0.0

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                # maxQ = self.get_maxQ(state)
                # maxActions = []
                # for action_key in self.Q[state]:
                #     if self.Q[state][action_key] == maxQ:
                #         maxActions.append(action_key)
                #action = random.choice(maxActions)
                maxQ = self.get_maxQ(state)
                max_actions = []
                for action in self.Q[state]:
                    if self.Q[state][action] == maxQ:
                        max_actions.append(action)
                action = random.choice(max_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] * (
                1 - self.alpha) + reward * self.alpha
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
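
The reset() method in the listing above experiments with several epsilon-decay schedules (a linear subtraction, math.e**(-0.01 * t), 0.7**t, and 1/t**2). The following standalone sketch is not part of the project code; it simply compares a linear and an exponential schedule over a few illustrative trial counts, with constants chosen only as assumptions to show the shape of each curve.

import math

def linear_decay(trial, step=0.05, start=1.0):
    # epsilon_t = epsilon_0 - step * t, clipped at 0
    return max(start - step * trial, 0.0)

def exponential_decay(trial, rate=0.01, start=1.0):
    # epsilon_t = epsilon_0 * e^(-rate * t)
    return start * math.exp(-rate * trial)

for trial in (1, 10, 20, 50, 100):
    print("trial %3d  linear=%.3f  exponential=%.3f"
          % (trial, linear_decay(trial), exponential_decay(trial)))

The linear schedule reaches zero after 20 trials, while the exponential schedule keeps a small amount of exploration for much longer, which is the trade-off the commented-out experiments above are probing.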
Exemplo n.º 59
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.total_trials = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice

        # self.epsilon = self.epsilon - 0.05
        self.epsilon = max(self.epsilon - 0.01, 0.0)
        self.total_trials += 1
        # self.epsilon = 1.5 * math.exp(-0.6 * self.total_trials)
        # self.epsilon = math.cos(0.5 * self.total_trials)
        # self.epsilon = 1.0 / float(self.total_trials * self.total_trials)
        # self.epsilon = math.pow(0.75, self.total_trials)



        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        # state = None
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])  # , inputs['right']

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        actions = self.Q[state]
        maxQ = max(actions.values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning and state not in self.Q.keys():
            self.Q[state] = {None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0}

        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                candidate_actions = [action for action in self.valid_actions if self.Q[state][action] == maxQ]
                action = random.choice(candidate_actions)
        return action



    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        # we want to use the alpha update: v = (1 - alpha) v + alpha * x
        if self.learning:
            previous = self.Q[state][action]
            self.Q[state][action] = (1.0 - self.alpha) * previous + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
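
The choose_action() method above combines epsilon-greedy exploration with random tie-breaking among actions that share the maximum Q-value. The snippet below is a minimal standalone sketch of that idea over a hypothetical one-state Q-table shaped like self.Q[state][action]; the state key and the Q-values are made up for illustration.

import random

valid_actions = [None, 'forward', 'left', 'right']
Q = {('green', None, None, 'forward'): {None: 0.0, 'forward': 2.1,
                                        'left': 0.4, 'right': 2.1}}

def choose(state, epsilon=0.1):
    if random.random() < epsilon:
        return random.choice(valid_actions)           # explore
    maxQ = max(Q[state].values())                     # exploit
    best = [a for a in valid_actions if Q[state][a] == maxQ]
    return random.choice(best)                        # break ties at random

print(choose(('green', None, None, 'forward')))       # 'forward' or 'right' most of the time

Breaking ties at random matters early in training, when every Q-value is still 0.0 and a deterministic max() would always pick the same action.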
Exemplo n.º 60
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, epsilon=0.8, alpha=0.8, gamma=0.9, tno=1):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        # self.q = {}  # initialize q-table dictionary
        self.epsilon = epsilon  # exploration rate
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount rate
        # self.lesson_counter = 0  # counts number of steps learned
        # self.steps_counter = 0  # counts steps in the trial
        self.reward_previous = None
        self.action_previous = None
        self.state_previous = None
        # self.q = {(('red', None), 'right'): 10, (('green', None), 'forward'): 15, (('green', None), 'left'): 20, (('red', None), 'left'): 25}
        self.q = {}
        self.tno = 0
        # self.success = []

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        # self.steps_counter = 0  # after finishing every trial the counter gets reset to 0
        self.tno += 1


    def getQ(self, state, action):
        # Helper to look up the Q-value for a (state, action) pair; unseen
        # pairs default to an optimistic initial value of 10.0.
        return self.q.get((state, action), 10.0)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        location = self.env.agent_states[self]["location"]
        destination = self.env.agent_states[self]["destination"]
        trial_num = t
        # self.tno += 1
        # sucess = self.env.sucess()

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        
        # TODO: Select action according to your policy
        # action = None
        self.actions = [None, 'left', 'right', 'forward']
        # action = self.next_waypoint
        # action = random.choice(Environment.valid_actions)
        # if random.random() < self.epsilon:  # exploration, e-greedy strategy
        if random.random() < self.epsilon * 1. / self.tno:  # exploration, e-greedy strategy with a 1/trial decay
            action = random.choice(self.actions)
        else:
            q = [self.getQ(self.state, a) for a in self.actions]  # list of Q-values for the current state, one per action
            maxQ = max(q)  # the maximum Q-value in the list
            count = q.count(maxQ)  # number of actions that tie for the maximum
            if count > 1:
                best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                i = random.choice(best)  # break ties by picking a maximizing index at random
            else:
                i = q.index(maxQ)
            action = self.actions[i]
        # action = random.choice(Environment.valid_actions)

        # Notes: decay rate
        # Environment.valid_actions
        # defaultdict


        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        if t == 0:
            # First step of the trial: initialize the (state, action) entry with the optimistic default
            oldv = self.q.get((self.state, action), 10.0)
            self.q[(self.state, action)] = oldv
        else:
            # Later steps: start from the Q-value of the previous state-action pair
            oldv = self.q.get((self.state_previous, self.action_previous), 0.0)
            self.q[(self.state, action)] = oldv + self.alpha * (reward - oldv + self.gamma * self.q.get((self.state, action), 10.0))
            # try:
            #     self.q[(self.state, action)] = oldv + self.alpha * (reward - oldv + self.gamma * self.q[(self.state, action)])
            # except Exception:
            #     self.q[(self.state, action)] = reward + 10  # the except branch handles states that have not been visited before
            # if oldv is None:
            #     self.q[(self.state, action)] = reward
            # else:
            #     self.q[(self.state, action)] = oldv + self.alpha * (reward - oldv)  # + self.gamma * self.q[(self.state, action)] - oldv
            #     self.q[(self.state, action)] = oldv + self.alpha * (reward - oldv + self.gamma)  # * self.q[(self.state, action)] - oldv

        uQ = self.q
        self.state_previous = self.state
        self.action_previous = action
        # self.steps_counter += 1
        # self.trial_num += 1

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, uQ = {}, location = {}, destination = {}, tno = {}".format(deadline, inputs, action, reward, uQ, location, destination, 1. / self.tno)  # [debug]