Example #1
File: QLearn.py Project: mic0331/smartcab
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.possible_actions= Environment.valid_actions
        self.ai = QLearn(epsilon=.05, alpha=0.1, gamma=0.9)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Use explicit keys rather than positions in inputs.items(), whose order is not guaranteed
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        
        action = self.ai.Q_move(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        inputs2 = self.env.sense(self)
        next_state = (inputs2['light'], inputs2['oncoming'], inputs2['left'], self.next_waypoint)

        self.ai.Q_post(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #2
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        
        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]
Example #3
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint
        self.learner = QLearn(epsilon=1, alpha=0.3, gamma=0.6)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.learner.action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_inputs = self.env.sense(self) 
        self.future_next_way_out = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.future_next_way_out)
        #next_state = (next_inputs[0], next_inputs[1], next_inputs[3], self.planner.next_waypoint())
        self.learner.learn(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #4
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_matrix = {}
        self.alpha = 0.95
        self.gamma = 0.05
        self.explore = 0.05
        self.state = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        deadline = self.env.get_deadline(self)

        state = self.observe_state()
        self.state = state
        
        knowledge = self.knowledge_from_state(state)

        # TODO: Select action according to your policy
        if random() < self.explore:
            action = choice(OPTIONS)
        else:
            action = sorted(knowledge.items(), key=lambda x: x[1], reverse=True)[0][0]

        old_value = knowledge[action]

        # Execute action and get reward
        reward = self.env.act(self, action)
        new_state = self.observe_state()
        new_state_knowledge = self.knowledge_from_state(new_state)

        self.q_matrix[state][action] = (1-self.alpha) * old_value +\
                                       self.alpha * (reward + self.gamma * max(new_state_knowledge.values()))
        

        #print "LearningAgent.update(): deadline = {}, state = {}, action = {}, reward = {}".format(deadline, state, action, reward)  # [debug]

    def knowledge_from_state(self, state):
        """
        Gets what we know about the given state
        """
        if not state in self.q_matrix:
            self.q_matrix[state] = {option: 0 for option in OPTIONS}
        return self.q_matrix[state]

    def observe_state(self):
        """
        Observe the current state
        """
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        return (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
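Example #4 calls random(), choice() and OPTIONS without showing where they come from. A plausible set of module-level definitions (an assumption; the original file may define them differently):

from random import random, choice

# Hypothetical constant listing the four moves the agent can make each step
OPTIONS = [None, 'forward', 'left', 'right']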
Example #5
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # location, heading, delta, deadline, oncoming, traffic light state
        inputs['waypoint'] = self.next_waypoint

        self.state = tuple(sorted(inputs.items())) # performs the actual update

        # TODO: Select action according to your policy
        actions = [None,'forward','left','right']
        action = actions[random.randint(0,3)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class BasicAgent(Agent):
    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.color = 'red'  # override color

    def reset(self, destination=None):
        self.planner.route_to(destination)  
        
    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        action = random.choice(self.env.valid_actions)
        reward = self.env.act(self, action)
        
        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        right = inputs['right']
       
        #Update the current observed state
        self.state = (light, oncoming, left, right, self.next_waypoint)
        
        print "Basic.update(): next move = {}".format(action)  # [debug]
Example #7
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = 0.2
        self.Qtable = {}
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.reward = 0
        
        
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # Inform Driving Agent
        self.state = (inputs['light'], inputs['oncoming'], 
                      inputs['left'], inputs['right'], self.next_waypoint)
        
        
        # TODO: Select action according to your policy
        valid_actions = [None, 'forward', 'left', 'right']
        
        # Random Pick Actions
        ### action = random.choice(valid_actions)
        
        # Get all Q values for all state
        all_actions = {action: self.Qtable.get((self.state, action), 0)
                        for action in valid_actions}
                           
        # Find action that maximizes Q values
        best_actions = [action for action in valid_actions 
                        if all_actions[action] == max(all_actions.values())]
           
        # Return Best Action
        action = random.choice(best_actions)    
         
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        # To simplify the problem, I set gamma equals 0.
        self.Qtable[(self.state, action)] = \
        (1 - self.alpha) * self.Qtable.get((self.state, action), 0) + self.alpha * reward
     
           
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #8
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here



    def reset(self, destination=None):
        self.planner.route_to(destination)



    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = inputs['location']
        self.heading=inputs['heading']

        action = random.choice(Environment.valid_actions)
        reward = self.env.act(self, action)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]
Example #9
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        #Actions
        self.A = ['forward', 'left', 'right', None] #all the actions available
        self.trial = 0

        #Initialize Q table(light, oncoming, next_waypoint)
        self.Q={}
        for i in ['green', 'red']:  #possible lights
            for j in [None, 'forward', 'left', 'right']: #possible oncoming_agent
                for k in ['forward', 'left', 'right']: #possible next_waypoints
                    self.Q[(i,j,k)] = [1] * len(self.A)


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)
        #Find the max Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        #assign action
        p = random.randrange(0,5)
        epsilon = 1.5 # with p drawn from {0,...,4}, this makes the agent act randomly 2 times out of 5
        if p<epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        gamma = 0.30
        alp_tune = 500 # tuning parameter
        alpha = 1.0/(1.1 + self.trial/float(alp_tune)) # decay learning rate (float division so alpha actually decays each trial)
        self.trial = self.trial+1

        #get the next action state Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_next_waypoint)

        #Update Q Table
        self.Q[self.state][self.A.index(action)] = \
        (1-alpha)*self.Q[self.state][self.A.index(action)] + \
        (alpha * (reward + gamma * max(self.Q[next_state])))
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.6, discount_factor=0.35):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.state = None
        self.Q_values = self.initialize_Q_values()
 
    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
       
    def initialize_Q_values(self, val=10.0):
        Q_values = {}
        for waypoint in ['left', 'right', 'forward']:
            for light in ['red', 'green']:
                for action in self.env.valid_actions:
                    Q_values[((waypoint, light), action)] = val
        return Q_values

    def find_max_Q(self, state):
        action = None
        max_Q = 0.0
        for a in self.env.valid_actions:
            Q = self.Q_values[(state, a)]
            if Q > max_Q:
                action = a
                max_Q = Q
        return (max_Q, action)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'])

        # TODO: Select action according to your policy
        #action = random.choice(self.env.valid_actions) 
        (Q, action) = self.find_max_Q(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        state_prime = (next_waypoint, inputs['light'])
        (Q_prime, action_prime) = self.find_max_Q(state_prime)        
        Q += self.alpha * (reward + self.gamma*Q_prime - Q) 
        self.Q_values[(self.state, action)] = Q

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #11
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        # # from route planner, also displayed by simulator
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = self.next_waypoint, inputs['light']
        # self.state = self.next_waypoint, (inputs['oncoming'] == None or inputs['light'] == 'green')

        # TODO: Select action according to your policy
        # Define actions
        actions = [None, 'forward', 'left', 'right']

        # QUESTION 1- select random action
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
class LearningAgent(Agent):
    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.next_waypoint = None
        self.total_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        loc, hed = self.get_my_location()
        action = self.get_next_waypoint_given_location( loc, hed)
        action_okay = self.check_if_action_is_ok(inputs)
        if not action_okay:
            action = None
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        #TODO: Learn policy based on state, action, reward
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        valid_waypoints = ['forward','right','left']
        valid_lights = ['red','green']
        valid_actions = ['forward','right','left', None]

        self.Q_hat={}
        states = [(next_waypoint,light,oncoming,right,left) for next_waypoint in valid_waypoints for light in valid_lights for oncoming in valid_actions for right in valid_actions for left in valid_actions]
        states.append('Destination')
        for state in states:
            self.Q_hat[state] = {}
            for action in valid_actions:
                self.Q_hat[state][action] = random.random()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if self.next_waypoint == None:
            self.state = 'Destination'
        else:
            self.state = (self.next_waypoint, inputs['light'],inputs['oncoming'],inputs['right'],inputs['left'])

        # TODO: Select action according to your policy

        valid_actions = ['forward','right','left', None]
        epsilon = 0.1 #0.01

        best_action = max(self.Q_hat[self.state],key=self.Q_hat[self.state].get)
        random_action = choice(valid_actions)
        action = choice([best_action, random_action],p=[1-epsilon,epsilon])

        # Execute action and get reward

        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward

        new_next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs = self.env.sense(self)
        if new_next_waypoint == None:
            new_state = 'Destination'  # mirror the special state used above, otherwise Q_hat lookup fails once the destination is reached
        else:
            new_state = (new_next_waypoint, new_inputs['light'], new_inputs['oncoming'], new_inputs['right'], new_inputs['left'])
        alpha = 0.5 #opt 0.7
        gamma = 0.5 #opt 0.1
        max_Qhat_ahat = max(self.Q_hat[new_state].values())
        self.Q_hat[self.state][action] = (1-alpha)*self.Q_hat[self.state][action]+alpha*(reward+gamma*max_Qhat_ahat)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #14
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.prevReward = 0
        self.prevAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.prevReward = 0
        self.prevAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = None
        validActions = []
        
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                validActions = ['right', None]
        else:
            if inputs['oncoming'] == 'forward':
                validActions = [ 'forward','right']
            else:
                validActions = ['right','forward', 'left']

        # TODO: Select action according to your policy
        if validActions != []:
            action = random.choice(validActions)
        else:
            action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if action != None:
            # both branches of the original reward comparison did the same thing, so keep one
            self.state = (inputs, self.next_waypoint, deadline)
            self.prevAction = action
            self.prevReward = reward


        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #15
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qtable = defaultdict(dict)
        self.epsilon = .4  # Randomness factor
        self.alpha = .4  # Learning rate.

    def reset(self, destination=None):
        # Prepare for a new trip; reset any variables here, if required
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.define_state(inputs, self.next_waypoint)

        # Select action according policy
        action = self.select_action()

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_qtable(action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]

    def select_action(self):
        if random.random() < self.epsilon or self.state not in self.qtable:
            action = random.choice(self.env.valid_actions)
        else:
            action = max(self.qtable[self.state], key=self.qtable[self.state].get)

        if self.state not in self.qtable:
            for a in self.env.valid_actions:  # init action:reward dict.
                self.qtable[self.state][a] = 0
        return action

    def update_qtable(self, action, reward):
        old_q = self.qtable[self.state].get(action, 0)
        self.qtable[self.state][action] = ((1 - self.alpha) * old_q) + (self.alpha * reward)
        # Decay alpha and epsilon until it reaches 0.1 and 0.0, respectively.
        self.alpha = max(0.001, self.alpha - 0.002)
        self.epsilon = max(0.0, self.epsilon - 0.002)

    @staticmethod
    def define_state(inputs, next_waypoint):
        return tuple(inputs.values()) + (next_waypoint, )
Example #16
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Qtable = {}
        self.alpha = 0.8
        self.gamma = 0.6

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        old_state = self.state
        
        # TODO: Select action according to your policy
        # Lazily initialise Q-values for the current state
        for a in [None, 'left', 'forward', 'right']:
            if (old_state, a) not in self.Qtable:
                self.Qtable[(old_state, a)] = 0

        max_action = max(self.Qtable[(old_state, a)] for a in [None, 'left', 'forward', 'right'])
        action = random.choice([i for i in ['left', 'forward', 'right', None] if self.Qtable[(old_state, i)] == max_action])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        #self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        new_state = self.state

        for a in [None, 'left', 'forward', 'right']:
            if (new_state, a) not in self.Qtable:
                self.Qtable[(new_state, a)] = 0

        self.Qtable[(old_state, action)] = (1 - self.alpha) * self.Qtable[(old_state, action)] + self.alpha *(reward + self.gamma * max(self.Qtable[(new_state, i)] for i in [None, 'left', 'forward', 'right']))
Example #17
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = [None, 'forward', 'left', 'right']
        self.learning_rate = 0.3
        self.state = None
        self.q = {}
        self.trips = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.trips += 1
        if self.trips >= 100:
            self.learning_rate = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],
                 self.next_waypoint)
        self.state = state

        # Lazy initialization of Q-values
        for action in self.actions:
            if (state, action) not in self.q:
                self.q[(state, action)] = 1
        
        # Select action according to the policy

        probabilities = [self.q[(state, None)], self.q[(state, 'forward')], self.q[(state, 'left')], self.q[(state, 'right')]]
        # Use the softmax function so that the values actually behave like probabilities
        probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=0)

        adventurousness = max((100.0 - self.trips) / 100.0, 0)  # float division so exploration decays gradually
        if random.random() < adventurousness:
            action = np.random.choice(self.actions, p=probabilities)
        else:
            action = self.actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q[(state, action)] = self.learning_rate * reward + (1 - self.learning_rate) * self.q[(state, action)]

        #print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def get_state(self):
        return self.state
Example #18
class BasicAgent(Agent):
    """A basic agent upon which to build learning agents."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qvals = {} # mapping (state, action) to q-values
        self.time = 0 # number of moves performed
        self.possible_actions = (None, 'forward', 'left', 'right')
        self.n_dest_reached = 0 # number of destinations reached
        self.last_dest_fail = 0 # last time agent failed to reach destination
        self.sum_time_left = 0 # sum of time left upon reaching destination over all trials
        self.n_penalties = 0 # number of penalties incurred
        self.last_penalty = 0 # last trial in which the agent incurred a penalty

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def best_action(self, state):
        """
        Return a random action (other agents will have different policies)
        """
        return random.choice(self.possible_actions)

    def update_qvals(self, state, action, reward):
        """
        Keeps track of visited (state, action) pairs. 
        (in other agents will use reward to update 
        the mapping from (state, action) pairs to q-values)
        """
        self.qvals[(state, action)] = 0
    
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # update time
        self.time += 1

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'],
                      self.next_waypoint)

        # Pick an action
        action = self.best_action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.n_penalties += 1
        
        # Update the q-value of the (state, action) pair
        self.update_qvals(self.state, action, reward)
        
        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #19
File: q_agent.py Project: xkeecs/smartCab
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        self.time_step = 1.0
    
    def get_decay_rate(self, t):  # decay rate for epsilon
        return 1.0 / float(t)
       
    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.time_step = 1.0
        # TODO: Prepare for a new trip; reset any variables here, if required
        '''
        self.previous_action = None
        self.next_state = None
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        '''
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        #env_states = self.env.agent_states[self]
        self.time_step += 1
        epsilon = self.get_decay_rate(self.time_step)
        self.QLearner.set_eps(epsilon)
        
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        #self.state = (inputs['light'], self.next_waypoint)

        action = self.QLearner.q_move(self.state)        
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        
        new_inputs = self.env.sense(self)
        #env_states = self.env.agent_states[self]
        new_deadline = self.env.get_deadline(self)
        self.next_state =  (new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], self.next_waypoint)
        #self.next_state =  (new_inputs['light'], self.next_waypoint)
        
        # TODO: Learn policy based on state, action, reward

        self.QLearner.q_future_reward(self.state, action, self.next_state, reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #20
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha, q_initial):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.policy = {}
        self.trip_log = []
        self.trip = None
        self.alpha = alpha
        self.q_initial = q_initial

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        if self.trip:
            self.trip_log.append(self.trip)
        self.trip = {}
        self.trip['Deadline'] = self.env.get_deadline(self)
        self.trip['Reward'] = 0
        self.trip['Penalty'] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        inputs.pop('right')
        self.state = inputs
        self.state['waypoint'] = self.next_waypoint

        # TODO: Select action according to your policy
        state_hash = hashabledict(frozenset(self.state.iteritems()))
        q = self.policy.get(state_hash, {
            None: self.q_initial,
            'forward': self.q_initial,
            'left': self.q_initial,
            'right': self.q_initial,
        })
        action = max(q.iteritems(), key=operator.itemgetter(1))[0]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Update trip stats
        self.trip['Reward'] += reward
        self.trip['Remaining'] = self.env.get_deadline(self)
        self.trip['Success'] = self.planner.next_waypoint() == None
        if reward < 0: self.trip['Penalty'] += reward

        # TODO: Learn policy based on state, action, reward
        q[action] = (1 - self.alpha) * q[action] + self.alpha * reward
        self.policy[state_hash] = q
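Example #20 hashes the state dict through a hashabledict helper and ranks Q-values with operator.itemgetter, neither of which appears in the snippet. A common minimal definition of the helper, plus the import the snippet needs (an assumption about the original project):

import operator  # used by max(q.iteritems(), key=operator.itemgetter(1))

class hashabledict(dict):
    # A dict that can be used as a dictionary key by hashing its (key, value) pairs (sketch)
    def __hash__(self):
        return hash(frozenset(self.items()))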
Example #21
class BasicLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(BasicLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0
        self.deadline = self.env.get_deadline(self)
        self.actions = ['forward', 'left', 'right', None]
        self.reached_destination = 0
        self.penalties = 0
        self.movements = 0

    def tuple_state(self, state):
        State = namedtuple("State", ["light", "next_waypoint"])
        return State(light = state['light'], next_waypoint = self.planner.next_waypoint()) 

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # TODO: Update state
        self.state = self.tuple_state(inputs)

        # TODO: Select action according to your policy
        action = random.choice(self.actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        if reward >= 10: self.reached_destination += 1
        if reward < 0: self.penalties += 1
        self.movements += 1

        # TODO: Learn policy based on state, action, reward 
        self.last_action = action
        self.last_state = self.state
        self.last_reward = reward
        self.total_reward += reward
Example #22
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.learning_rate = 0.9
        self.states = None
        self.possible_actions = [None, 'forward', 'left', 'right']
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)



        # TODO: Update state
        # state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],deadline)
        state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])
        # state = (self.next_waypoint,inputs['light'])
        self.state = state

        for action in self.possible_actions:
            if (state, action) not in self.Q:
                self.Q[(state, action)] = 20

        # TODO: Select action according to your policy
        Q_best_value = [self.Q[(state, None)], self.Q[(state, 'forward')], self.Q[(state, 'left')], self.Q[(state, 'right')]]
        # Softmax
        Q_best_value = np.exp(Q_best_value) / np.sum(np.exp(Q_best_value), axis=0)
        
        #action = np.random.choice(self.possible_actions, p=probabilities)
        action = self.possible_actions[np.argmax(Q_best_value)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward == -10.0 :
            print "Invalid move"
        if reward == -0.5 :
            print "valid, but is it correct?"
        

        # TODO: Learn policy based on state, action, reward
        # self.Q[(state,action)] = reward + self.learning_rate * self.Q[(state,action)]
        self.Q[(state,action)] = reward * self.learning_rate + ( 1 - self.learning_rate ) * self.Q[(state,action)]
Example #23
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        
        # TODO: Initialize any additional variables here
        self.acts = ['left', 'right', 'forward', 'stay']
        df_tmp = pd.DataFrame(0*np.ones((2, 4)), index=['red', 'green'], columns=self.acts)
        self.Q = df_tmp
        self.deadline = 0
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        self.deadline = deadline

        # TODO: Update state
        self.state = inputs['light'] # in this case the state is simply defined by the color of the traffic light at the intersection
        
        # TODO: Select action according to your policy
        # Q-Learner
        act_opt = self.Q.ix[self.state].argmax()
        if act_opt=='stay': act_opt=None
#            print "Current state = %s, qlearner action = %s" % (self.state, act_opt)
        action = act_opt

        # Execute action and get reward
        reward = self.env.act(self, action)


        # TODO: Learn policy based on state, action, reward

        # Maximize Q conditional on the state we are in = S
        q_S = self.Q.ix[self.state] # Q conditional on state S
        q_S_mx = q_S[q_S==q_S.max()] # just in case there are multiple values and the action that maximizes Q(S, a) is not unique
        ix_mx = np.random.randint(0,len(q_S_mx)) # pick one of the equally qlearner actions
        q_S_mx = q_S_mx.iloc[ix_mx]

        # Update Q
        act_tmp = action if action !=None else 'stay'
        global alpha, alpha0, gamma
        alpha *= alpha0
        self.Q.ix[self.state, act_tmp] = \
            (1-alpha)*self.Q.ix[self.state, act_tmp] + \
            alpha*(reward + gamma*q_S_mx)
        # note: we update the Q-value of the action actually taken, while the future-reward term
        # uses the best Q-value available in this state, i.e. the action we would have taken had
        # we behaved greedily
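Example #23 reads alpha, alpha0 and gamma from module-level globals that are not part of the snippet. Hypothetical definitions, with values chosen only to make the sketch self-consistent (the original project may use different numbers):

# Assumed module-level hyperparameters for Example #23
alpha = 0.9     # initial learning rate
alpha0 = 0.99   # multiplicative decay applied on every update (alpha *= alpha0)
gamma = 0.3     # discount factor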
Example #24
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.QLearning = QLearning(epsilon=0.3, alpha=0.95, gamma=0.9)
        # self.QLearning = pickle.load(open('./QLearning.pkl'))

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def is_action_okay(self, inputs):
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False

        return action_okay

    def update(self, t):
        # Gather states
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        action_okay = self.is_action_okay(inputs)
        self.state = (action_okay, self.next_waypoint, deadline < 3)
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.QLearning.policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Gather next states
        next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        next_inputs = self.env.sense(self)
        next_deadline = self.env.get_deadline(self)
        next_action_okay = self.is_action_okay(next_inputs)
        next_state = (next_action_okay, next_waypoint, next_deadline < 3)
        # next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_waypoint)

        # TODO: Learn policy based on state, action, reward
        self.QLearning.update_QTable(self.state, action, reward, next_state)
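Example #24 stores all learning inside a QLearning object (optionally reloaded from a pickle) whose implementation is not shown. A compact sketch along the same lines as the QLearn sketch after Example #1, using the constructor arguments and the policy/update_QTable method names visible above; the rest is an assumption:

import random

class QLearning(object):
    # Sketch of the assumed helper: epsilon-greedy policy over a tabular Q function
    def __init__(self, epsilon=0.3, alpha=0.95, gamma=0.9,
                 actions=(None, 'forward', 'left', 'right')):
        self.epsilon, self.alpha, self.gamma = epsilon, alpha, gamma
        self.actions = actions
        self.Q = {}  # (state, action) -> value

    def policy(self, state):
        # Random action with probability epsilon, otherwise the current greedy action
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        qs = [self.Q.get((state, a), 0.0) for a in self.actions]
        return self.actions[qs.index(max(qs))]

    def update_QTable(self, state, action, reward, next_state):
        future = max(self.Q.get((next_state, a), 0.0) for a in self.actions)
        old = self.Q.get((state, action), 0.0)
        self.Q[(state, action)] = (1 - self.alpha) * old + self.alpha * (reward + self.gamma * future)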
Example #25
File: agent.py Project: llathrop/smartcab
class RandomAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(RandomAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.availableAction = ['forward', 'left', 'right',None]   
        self.goal=0
        self.steps=0
        self.features=[]
        self.deadline=[]
        self.total_reward=[0]

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        #print"RESET, Final state:\n", self.state
        try:
            if self.deadline[len(self.features)-1] >0: # time still left on the deadline, i.e. the destination was reached
                self.goal+=1 #FIXME - order
                print "PASS! {} steps to goal,Goal reached {} times out of {}!".format(
                                                        self.deadline[len(self.features)-1],self.goal,len(self.features))
            else:
                print "FAIL! {} steps to goal,Goal reached {} times out of {}!".format(
                                                        self.deadline[len(self.features)-1],self.goal,len(self.features))
                pass
        except:
            print "Trial 0 - Goal reached {} times out of {}!".format(self.goal,len(self.features))
            pass
        print "----------------------------------------------------------"
        self.features.append({})
        self.deadline.append(None)
        self.total_reward.append(0)
        self.steps=0

    def update(self, t):
        # Gather inputs
        self.steps+=1
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        #self.deadline[len(self.features)] = self.env.get_deadline(self)
        self.state=inputs
        self.features[len(self.features)-1][self.steps]=inputs
        self.deadline[len(self.deadline)-1] = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        action = self.availableAction[random.randint(0,3)]    
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.lastReward=reward
        # TODO: Learn policy based on state, action, reward
        self.total_reward[len(self.total_reward)-1] =self.total_reward[len(self.total_reward)-1]+reward
Example #26
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.78, discount_factor=0.36):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.actions = Environment.valid_actions
        self.oldAction = None


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.oldAction=None
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        #------------------------------------------------
        #action = np.random.choice(self.actions)
        #------------------------------------------------
        


        # TODO: Select action according to your policy
        current_state = tuple(inputs.values() + [self.next_waypoint])
        self.state = (inputs, self.next_waypoint)
        
        if current_state not in self.Q:
            self.Q[current_state] = [3, 3, 3, 3]

        Q_max = self.Q[current_state].index(np.max(self.Q[current_state]))
        # Execute action and get reward
        action = self.actions[Q_max]
        reward = self.env.act(self, action)



        # TODO: Learn policy based on state, action, reward
        
        if t != 0:
            oldvalue = self.Q[self.oldAction[0]][self.oldAction[1]]
            newvalue = ((1 - self.learning_rate)*oldvalue) + (self.learning_rate * (self.oldAction[2] + self.discount_factor * self.Q[current_state][Q_max]))
            self.Q[self.oldAction[0]][self.oldAction[1]] = newvalue
        
        self.oldAction = (current_state, self.actions.index(action), reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #27
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.debugOn = False
        self.reporter = Reporter(driver_type, self.env.valid_actions)
        if driver_type=='random':
            self.driver = RandomDriver(self.env.valid_actions)
        elif driver_type=='greedy':
            self.driver = GreedyDriver(self.env.valid_actions)
        elif driver_type=='greedy2':
            self.driver = Greedy2Driver(self.env.valid_actions)
        elif driver_type=='smart':
            self.driver = SmartDriver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart2':
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart3':
            self.driver = Smart3Driver(self.env.valid_actions, self.reporter)
        else:
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.reporter.reset(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = State.make(self.next_waypoint, inputs)
        
        # TODO: Select action according to your policy
        action = self.driver.decide(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        newWaypoint = self.planner.next_waypoint()
        newState = State.make(newWaypoint, self.env.sense(self))
        self.driver.learn(t, deadline, self.state, action, reward, newState)

        location = self.env.agent_states[self]['location']
        if self.debugOn:
            print "LearningAgent.update(): deadline={} inputs={} action={} reward={} location={}".format(deadline, inputs, action, reward, location)  # [debug]
        self.reporter.update(t, deadline, self.state, action, reward, location, self.driver)
Example #28
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, trials):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        # TODO: Initialize any additional variables here
        self.trials = trials
        self.success = 0  # How many times agent able to reach the target for given trials?

        self.penalties = 0
        self.total_rewards = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)

        # TODO: Prepare for a new trip; reset any variables here, if required
        self.penalties = 0
        self.total_rewards = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # self.state = inputs
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'])
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = random.choice(self.env.valid_actions)  # Initial random movement / random action

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".\
            format(deadline, inputs, action, reward)  # [debug]

        # Statistics
        self.total_rewards += reward

        if reward < 0:
            self.penalties += 1

        if reward > 5:
            self.success += 1
            print "\nTrial info: Number of Penalties =  {}, Net rewards: {}. \nTotal success/trials: {}/{}. \n".\
                format(self.penalties, self.total_rewards, self.success, self.trials)
Example #29
0
File: agent.py Project: qmaruf/smartcab
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here        
        self.actions = ['forward', 'left', 'right',None]        
        self.q_table = Counter()        
        self.alpha = 0.5
        self.gamma = 0.2


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def get_best_action(self, state):
        best_profit = -999
        best_action = None
        for a in self.actions:
            profit = self.q_table[(state, a)]            
            if profit > best_profit:
                best_profit = profit
                best_action = a
        return best_action, best_profit


    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state        
        self.state = (inputs['light'],inputs['oncoming'], inputs['left'], self.next_waypoint)       

        if randint(0,10) < 3:
            action = self.actions[randint(0, 3)]
        else:        
            action, _ = self.get_best_action(self.state)
        
        reward = self.env.act(self, action)

        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_inputs['left'], next_next_waypoint)      
       
        _, best_profit = self.get_best_action(next_state)
        self.q_table[(self.state, action)] = (1-self.alpha)*self.q_table[(self.state, action)] + (self.alpha * (reward + self.gamma * best_profit))

       
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #30
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0

        #Q matrix
        self.Q = {}

        # initial Q values
        self.initial_q = 30

        # Initialize state
        self.states = None

        # Dictionary of possible actions to index states from Q matrix
        self.actions = {None : 0, 'forward' : 1, 'left' : 2, 'right' : 3}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Check if this state is included in the Q matrix. If not, create a new row in the matrix.
        if self.state not in self.Q:
            self.Q[self.state] = [self.initial_q for i  in range(len(self.env.valid_actions))]

        # TODO: Select action according to your policy
        learning_rate = 0.8
        action = self.env.valid_actions[np.argmax(self.Q[self.state])]
        discount = 0
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        # TODO: Learn policy based on state, action, reward

        # get the future state
        future_state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Update the Q matrix (add consideration of future reward)
        #self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] * (1 - learning_rate) + reward * learning_rate
        self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] + learning_rate * (reward + discount * self.Q[future_state][self.actions[action]] - self.Q[self.state][self.actions[action]])
Example #31
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        # Set any additional class parameters as needed
        self.trial = 1

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
           'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0

        else:
            #self.epsilon -= 0.05  ##To be used in default learner
            self.epsilon = -math.pow(-0.03 * self.trial, 2) + 1
            self.alpha = -math.pow(-0.02 * self.trial, 2) + 1

        if self.learning:
            self.trial += 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (inputs['light'], inputs['oncoming'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if not state in self.Q.keys():
                state_dict = {}
                for action in self.valid_actions:
                    state_dict[action] = 0.0
                self.Q[state] = state_dict

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability

        if (self.learning and
                random.random() < self.epsilon) or state not in self.Q.keys():
            action = random.choice(self.valid_actions)

        else:
            Q_val = self.get_maxQ(state)
            for a in self.valid_actions:
                if self.Q[state][a] == Q_val:
                    action = a
                    break

        #   Otherwise, choose an action with the highest Q-value for the current state

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            current_Q = self.Q[state][action]
            self.Q[state][action] = (
                1 - self.alpha) * current_Q + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
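The decay schedules above reduce to epsilon = 1 - (0.03 * trial)^2 and alpha = 1 - (0.02 * trial)^2, since the inner minus sign disappears when squared. A quick standalone check of how they evolve (the trial values below are chosen purely for illustration):

for t in (1, 10, 20, 30, 34):
    epsilon = 1 - (0.03 * t) ** 2
    alpha = 1 - (0.02 * t) ** 2
    print("trial {}: epsilon {:.3f}, alpha {:.3f}".format(t, epsilon, alpha))
# epsilon crosses zero after about 33 training trials (1 / 0.03), after which
# choose_action() above behaves greedily except in states missing from Q.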
Example #32
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_values = {}
        self.penalties = []
        self.penalty = 0
    def reset(self, destination=None):
        self.penalties.append(self.penalty)
        self.penalty = 0
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        curr_state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        self.state = curr_state

        old_q, action = self.choose_best_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # calculate penalties
        if reward < 0:
            self.penalty += 1

        # Sense the environment, and obtain new state
        new_inputs = self.env.sense(self)
        new_state = (self.next_waypoint, new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], new_inputs['right'])

        # Update the Q table
        new_q_val = self.calculate_q_val(new_state, old_q, reward)

        self.q_values[(curr_state, action)] = new_q_val

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, waypoint = {}".format(deadline, inputs, action, reward, self.next_waypoint)  # [debug]

    def get_max_q(self, new_state):
        q = [self.get_q_value(new_state, action) for action in Environment.valid_actions]
        max_q = max(q)
        return max_q, q

    def choose_best_action(self, state):
        max_q, q_lst = self.get_max_q(state)

        # If there are multiple max_q, pick a random one
        if q_lst.count(max_q) > 1:
            max_q_lst = [i for i in range(len(Environment.valid_actions)) if q_lst[i] == max_q]
            idx = random.choice(max_q_lst)
        else:
            idx = q_lst.index(max_q)

        action = Environment.valid_actions[idx]

        return max_q, action

    def get_q_value(self, state, action):
        return self.q_values.get((state, action), 1.0)

    def calculate_q_val(self, new_state, old_q, reward):
        # learning rate
        alpha = 0.5
        # discount factor
        gamma = 0.25
        # learned value
        learned_val = reward + (gamma * self.get_max_q(new_state)[0] - old_q)

        new_q = old_q + alpha * learned_val

        return new_q
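choose_best_action above breaks ties among equal Q-values at random instead of always returning the first action. A minimal standalone illustration of that idea (toy Q-values, hypothetical state):

import random

valid_actions = [None, 'forward', 'left', 'right']
q_row = [1.0, 0.2, 1.0, 0.3]                      # toy Q-values for one state
best = max(q_row)
tied = [a for a, q in zip(valid_actions, q_row) if q == best]
print(random.choice(tied))                        # prints None or 'left', chosen at random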
Example #33
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        self.state = {}
        self.learning_rate = 0.6
        self.exploration_rate = 0.1
        self.exploration_degradation_rate = 0.001
        self.discount_rate = 0.4
        self.q_values = {}
        self.valid_actions = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        self.state = self.build_state(inputs)

        # TODO: Select action according to your policy
        action = self.choose_action_from_policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        self.update_q_value(self.state, action, reward)

        #print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

    def build_state(self, inputs):
        return {
            "light": inputs["light"],
            "oncoming": inputs["oncoming"],
            "left": inputs["left"],
            "direction": self.next_waypoint
        }

    def choose_action_from_policy(self, state):
        if random.random() < self.exploration_rate:
            self.exploration_rate -= self.exploration_degradation_rate
            return random.choice(self.valid_actions)
        best_action = self.valid_actions[0]
        best_value = 0
        for action in self.valid_actions:
            cur_value = self.q_value_for(state, action)
            if cur_value > best_value:
                best_action = action
                best_value = cur_value
            elif cur_value == best_value:
                best_action = random.choice([best_action, action])
        return best_action

    def max_q_value(self, state):
        max_value = None
        for action in self.valid_actions:
            cur_value = self.q_value_for(state, action)
            if max_value is None or cur_value > max_value:
                max_value = cur_value
        return max_value

    def q_value_for(self, state, action):
        q_key = self.q_key_for(state, action)
        if q_key in self.q_values:
            return self.q_values[q_key]
        return 0

    def update_q_value(self, state, action, reward):
        q_key = self.q_key_for(state, action)
        cur_value = self.q_value_for(state, action)
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()
        new_state = self.build_state(inputs)
        learned_value = reward + (self.discount_rate *
                                  self.max_q_value(new_state))
        new_q_value = cur_value + (self.learning_rate *
                                   (learned_value - cur_value))
        self.q_values[q_key] = new_q_value

    def q_key_for(self, state, action):
        return "{}|{}|{}|{}|{}".format(state["light"], state["direction"],
                                       state["oncoming"], state["left"],
                                       action)
Example #34
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=0.99, alpha=0.8):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        # self.epsilon=self.epsilon*0.99
        self.epsilon = self.epsilon * 0.99
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        dic = self.Q[state]
        maxQ = -100000.0
        for key in dic:
            if dic[key] > maxQ:
                maxQ = dic[key]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        dic = self.Q
        if state not in dic:
            dic[state] = {
                self.valid_actions[0]: 0.0,
                self.valid_actions[1]: 0.0,
                self.valid_actions[2]: 0.0,
                self.valid_actions[3]: 0.0
            }
        self.Q = dic

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if self.learning == False:

            i = random.randint(0, 3)
            action = self.valid_actions[i]
        elif self.learning == True:
            j = random.random()
            if j < self.epsilon:
                i = random.randint(0, 3)
                action = self.valid_actions[i]
            else:
                dic = self.Q[state]
                action = None
                l1 = list()
                for key in dic:
                    if dic[key] == self.get_maxQ(state):
                        l1.append(key)
                m = len(l1)
                action = l1[random.randint(0, m - 1)]
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning == True:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
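The reset above decays epsilon geometrically, multiplying it by 0.99 once per trial. As a back-of-envelope check (not part of the snippet), the number of training trials needed to drop below a tolerance tol is log(tol) / log(0.99):

import math

for tol in (0.1, 0.05, 0.01):
    trials = math.log(tol) / math.log(0.99)
    print("epsilon < {} after about {} trials".format(tol, int(math.ceil(trials))))
# roughly 230, 299 and 459 trials respectively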
Example #35
0
File: agent.py Project: neilmistry/mlnd
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """ 

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        
        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing == True:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.epsilon - 0.002
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint 
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        
        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed 
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.
        
        # Set 'state' as a tuple of relevant data for the agent        
        state = (waypoint,inputs['light'],inputs['oncoming'],inputs['left'],inputs['right'])

        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        stateactions = self.Q[state]
        highq = max(stateactions.iterkeys(), key=(lambda key: stateactions[key]))
        maxQ = stateactions[highq]
        return maxQ 


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning == True:
            if state not in self.Q:
                actionset = {'left': 0.0, 'right': 0.0, 'forward': 0.0, None : 0.0}
                self.Q[state] = actionset
                    
        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
                    
        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        
        if self.learning == False:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                maxq = self.get_maxQ(state)
                maxqactions = []
                for action, qvalue in self.Q[state].iteritems():
                    if maxq == self.Q[state][action]:
                        maxqactions.append(action)
                action = random.choice(maxqactions)
        return action


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning == True:  
            self.Q[state][action] = self.Q[state][action]*(1.0-self.alpha)+self.alpha*reward

        return


    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
Example #36
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """ 

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        random.seed(1177)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        
        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # commented out testing parameters
            #self.epsilon = self.epsilon - 0.05
            self.t += 1.0
            # self.epsilon = 1.0/(self.t**2)
            # self.epsilon = 1.0/(self.t**2 + self.alpha*self.t)
            # self.epsilon = 1.0/(self.t**2 - self.alpha*self.t)
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))/(self.t**2)
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            self.epsilon = math.exp(-self.alpha*self.t)
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint 
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent        
        def xstr(s):
            if s is None:
                return 'None'
            else:
                return str(s)
        
        state = xstr(waypoint) + "_" + inputs['light'] + "_" + xstr(inputs['left']) + "_" +  xstr(inputs['oncoming'])
        if self.learning:
            self.Q[state] = self.Q.get(state, {None:0.0, 'forward':0.0, 'left':0.0, 'right':0.0})
        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = -1000.0
        for action in self.Q[state]:
            if maxQ < self.Q[state][action]:
                maxQ = self.Q[state][action]
        return maxQ


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            self.Q[state] = self.Q.get(state, {None:0.0, 'forward':0.0, 'left':0.0, 'right':0.0})

        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > 0.01 and self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                valid_actions = []
                maxQ = self.get_maxQ(state)
                for act in self.Q[state]:
                    if maxQ == self.Q[state][act]:
                        valid_actions.append(act)
                action = random.choice(valid_actions)
        return action


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] + self.alpha*(reward-self.Q[state][action])

        return


    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
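The learn step above uses the incremental form Q + alpha * (reward - Q), whereas most of the other examples write the blended form (1 - alpha) * Q + alpha * reward. A toy check (made-up numbers) that the two are the same update:

alpha, q, reward = 0.5, 2.0, -0.5
incremental = q + alpha * (reward - q)
blended = (1 - alpha) * q + alpha * reward
assert abs(incremental - blended) < 1e-12
print(incremental)  # 0.75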
Example #37
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.times = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.times = self.times + 1
            #self.epsilon = pow(0.996,self.times)
            #self.epsilon = (self.epsilon - 0.05) if (self.epsilon - 0.05) > 0 else 0
            self.epsilon = math.cos(0.0015 * self.times)
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ_value = max(self.Q[state].values())
        same_maxQ = []

        ###extract the same value of maxQ into a list
        for key, value in self.Q[state].items():
            if value == maxQ_value:
                same_maxQ.append(key)

        #choose an action randomly from the waiting list
        #maxQ = random.sample(same_maxQ, 1)[0]
        maxQ = random.choice(same_maxQ)

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        #if state not in self.Q:
        #    self.Q[state] ={'left':0.0, 'right':0.0, 'forward':0.0, None:0.0}
        if self.learning:
            self.Q.setdefault(state,
                              {action: 0.0
                               for action in self.valid_actions})

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if self.learning == False:
            action = random.sample(self.valid_actions, 1)[0]
        elif random.random() < self.epsilon:
            action = random.sample(self.valid_actions, 1)[0]
        else:
            action = self.get_maxQ(self.state)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        self.Q[state][action] = (
            1 - self.alpha) * self.Q[state][action] + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Example #38
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        self.trial_count = 0  # Use as input for epsilon decay functions

        self.overall_states = 384  # Size of Q-Table = States Combination of all features = 2 * 4 * 4 * 4 * 3
        self.overall_state_action = 384 * 4  # Combination of all possible state + valid actions = 1536
        self.state_action_count = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice

        # Linear: a = 0.05, 20 training trials
        def decay_linear(a):
            self.epsilon -= a

        # Polynomial: a = 20, 20 training trials
        def decay_frac(a):
            self.epsilon = 1.0 / a**2

        # Exponential: a = 0.995, 600 training trials
        def decay_exponential(a):
            self.epsilon = a**self.trial_count

        def decay_exponential_e(a):
            self.epsilon = math.e**(-a * self.trial_count)

        def decay_cosine(a):
            self.epsilon = math.cos(a * self.trial_count)

        # Hold epsilon at 1 during all training trials (in order to explore state-action combination more efficiently)
        def decay_step(total_trials):
            if self.trial_count > total_trials:
                self.epsilon = 0.0

        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0,
        # make sure retrieve from Q-Table instead of random walk.
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            self.trial_count += 1
            decay_exponential(0.995)
            # decay_linear(0.05)
            # decay_step(600)

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # Use the 5 basic features; the state is the combination of all their values, 384 in total (waypoint has only 3 possible values).
        state = (inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {
                    None: 0.0,
                    'forward': 0.0,
                    'left': 0.0,
                    'right': 0.0
                }  # Init default Q Score.
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        # "Choose a random action with probability epsilon" means the following:
        # there are two options: opt1 picks any action at random, opt2 picks the action with the highest Q-value.
        # epsilon lies in [0, 1] and shrinks gradually over time,
        # so if epsilon is 0.7, opt1 is taken with probability 0.7 and opt2 with probability 0.3.
        if not self.learning:
            action = random.choice(
                self.valid_actions
            )  # Pick a random action from All 4 Valid Actions.
        else:
            if random.random() < self.epsilon:
                action = random.choice(
                    self.valid_actions
                )  # Pick a random action from All 4 Valid Actions.
            else:
                # action = max(self.Q[state], key=self.Q[state].get)  # Get Key with max Q Score as action.
                # When several actions tie for the maximum Q-value, pick one at random.
                max_Q = max(self.Q[state].values())
                max_candidate = [
                    x for x in self.Q[state] if self.Q[state][x] == max_Q
                ]
                action = random.choice(max_candidate)
        """Hard Coded Driving Logic For Fun"""
        # if inputs['light'] == 'red':
        #     if self.next_waypoint == 'right' and inputs['oncoming'] != 'left' and inputs['left'] != 'forward':
        #         action = self.next_waypoint
        #     else:
        #         action = None
        # else:
        #     if self.next_waypoint == 'left' and (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
        #         action = 'forward'
        #     else:
        #         action = self.next_waypoint

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        prev_Q = self.Q[state][action]
        self.Q[state][action] = prev_Q * (1 - self.alpha) + reward * self.alpha

        # Show the current learning progress: coverage of <state> entries and of <state, action> pairs.
        if prev_Q == 0 and reward != 0:
            self.state_action_count += 1
        print 'Trial Count =', self.trial_count
        print 'Q-Table Size = {} / {}'.format(len(self.Q), self.overall_states)
        print 'Q-Table Non-zero Item Count = {} / {}'.format(
            self.state_action_count, self.overall_state_action)
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
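The reset above sketches several decay schedules and calls the exponential one. The standalone comparison below evaluates a few of them at illustrative trial counts; the parameter values (0.995 for the exponential, 0.005 for the e-based and cosine forms) are assumptions for illustration, not the project's tuned settings:

import math

for t in (1, 100, 300, 600):
    exponential = 0.995 ** t
    exponential_e = math.e ** (-0.005 * t)
    cosine = math.cos(0.005 * t)
    print("trial {}: {:.3f} {:.3f} {:.3f}".format(t, exponential, exponential_e, cosine))
# the cosine schedule goes negative once its argument passes pi/2, so it would
# need clamping or a cutoff such as decay_step() above.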
Example #39
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.time_step = 1


    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.alpha = 0
            self.epsilon = 0
        else:
            self.alpha = 0.007
            self.epsilon = math.exp(-self.time_step * self.alpha)
            self.time_step = self.time_step + 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])

        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        st = self.Q[state]
        # adapted from Lucretiel's answer here:  http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
        max_key = max(st, key=lambda key: st[key])
        return st[max_key]


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            if state not in self.Q:
                self.Q[state] = {None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0}
        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if (not self.learning) or (self.epsilon > random.random()):
            return random.choice(self.valid_actions)
        else:
            max_val = self.get_maxQ(state)

            # Break ties at random among actions that share the maximum Q-value
            best_actions = [act for act, q in self.Q[state].iteritems()
                            if q == max_val]
            return random.choice(best_actions)


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        return


    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
예제 #40
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial = 1
        self.epsilonDecayVariable = epsilon
        self.alphaDecayVariable = alpha

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        #self.epsilon = self.epsilon - .05
        # epsilon = e^(-a*t), for 0 < a < 1
        #self.epsilon = math.exp(-self.epsilonDecayVariable*self.trial*1)
        # epsilon = cos(a*t), for 0 < a < 1
        #self.epsilon = math.cos(self.epsilonDecayVariable*self.trial)
        # Active schedule: a logistic-style decay. With |ParmA| this small, the exp(-ParmA*t)
        # factor stays ~1, so epsilon follows 1 / (1 + e^(ParmB*(t - ParmC))) and drops off
        # around trial ParmC. (A small comparison sketch of these schedules follows this example.)
        trialNum = self.trial
        ParmA = -5E-11
        ParmB = 0.104
        ParmC = 210

        self.epsilon = 1 * math.exp(
            -ParmA * trialNum) / (1 + math.exp(ParmB * (trialNum - ParmC)))
        #self.epsilon=self.epsilon -.05
        #self.epsilon=math.exp(-0.0005*self.trial)*(4+math.cos(.7*self.trial))/5
        #=EXP   (-lambda*A1)*(4+COS(freq*A1))/5

        #decay learning rate
        #self.alpha =.75 #math.exp(-self.alphaDecayVariable*self.trial)
        #self.epsilon =math.pow(self.epsilonDecayVariable,self.trial )

        # Update additional class parameters as needed
        #not sure on additional class parms  -time remaining, success rate,etc.. ?

        self.trial = self.trial + 1
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        state = (inputs['light'], inputs['oncoming'], inputs['right'],
                 inputs['left'], waypoint)

        if self.learning:
            if state not in self.Q:
                # State not yet in the Q-table; initialize every action's Q-value to 0
                self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0}

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########

        #get the highest Q value from the dict of actions for this state: state parm.
        maxQ = max(self.Q[state].values())
        #maxQ = None

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        #action = random.choice(self.valid_actions)

        #print(action)
        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        #get max q value and the actions that achieve it, breaking ties at random
        maxQ = self.get_maxQ(state)

        dictActions = self.Q[state]
        best_actions = [act for act, q in dictActions.iteritems() if q == maxQ]
        action = random.choice(best_actions)

        if self.learning:
            #if the random choice is within epsilon prob, then choose randomly.
            lDblRand = random.random()
            #is the random less than epsilon ?
            if lDblRand < self.epsilon:
                action = random.choice(self.valid_actions)
        else:
            print('not learning')

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + (reward * self.alpha)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
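The reset() in the example above tries several epsilon-decay schedules: a linear step, an exponential, a cosine, and the logistic-style form that is left active. A small standalone sketch comparing how such schedules fall over training trials (the parameter values below are illustrative, not the tuned ones from the agent):

import math

def linear_decay(t, step=0.005):
    return max(0.0, 1.0 - step * t)

def exponential_decay(t, a=0.01):
    return math.exp(-a * t)

def logistic_decay(t, b=0.104, c=210):
    return 1.0 / (1.0 + math.exp(b * (t - c)))

# Linear reaches zero at a fixed trial, exponential never quite does,
# and the logistic stays near 1 before dropping sharply around trial c.
for t in (0, 100, 200, 300, 400):
    print(t, round(linear_decay(t), 3), round(exponential_decay(t), 3), round(logistic_decay(t), 3))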
예제 #41
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self,
                 env,
                 learning=False,
                 epsilon=1.0,
                 alpha=0.5,
                 epsilon_threshold=0.005,
                 a=0.5,
                 experiment="na"):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.epsilon_threshold = epsilon_threshold
        self.t = 0
        self.a = a
        self.experiment = experiment

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            if self.experiment == "2":
                self.epsilon = self.a**self.t
            elif self.experiment == "3":
                if self.t == 0:
                    self.t = 1.0
                self.epsilon = 1.0 / (self.t**2)
            elif self.experiment == "4":
                self.epsilon = math.e**-(self.a * self.t)
            elif self.experiment == "5":
                self.epsilon = math.cos(self.a * self.t)
            else:
                self.epsilon = self.epsilon - self.epsilon_threshold

            self.t = self.t + 1.0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn
        # about the balance between exploration and exploitation.
        # With hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[self.key_state(state)].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            if self.Q.get(self.key_state(state)) is None:
                actions_ = {}
                for a in self.valid_actions:
                    actions_[a] = 0.0
                self.Q[self.key_state(state)] = actions_

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if not self.learning:
            return random.choice(self.valid_actions)

        if self.epsilon > 0.01 and self.epsilon > random.random():
            action = random.choice(self.valid_actions)
        else:
            max_value = self.get_maxQ(state)
            # Break ties at random among actions that share the maximum Q-value
            actions_ = [action_ for (action_, value_)
                        in self.Q[self.key_state(state)].iteritems()
                        if value_ == max_value]
            action = random.choice(actions_)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        if self.learning:
            s_a = self.Q[self.key_state(state)][action]
            self.Q[self.key_state(
                state)][action] = (1 - self.alpha) * s_a + self.alpha * reward
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return

    def key_state(self, state):
        """ It receives an state and returns a string that serves as
            key for the Q table """

        return "{0}_LI:{1}_O:{2}_R:{3}_L:{4}".format(state[0],
                                                     state[1]['light'],
                                                     state[1]['oncoming'],
                                                     state[1]['right'],
                                                     state[1]['left'])
예제 #42
0
class QLearningAgent(Agent):
    # An agent using Q learning learns to drive

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)
        self.color = 'red'
        self.planner = RoutePlanner(self.env, self)
        self.deadline = self.env.get_deadline(self)
        self.next_waypoint = None
        self.moves = 0

        self.qDict = dict()
        self.alpha = 0.9  # learning rate
        self.epsilon = 0.05  # probability of flipping the coin
        self.gamma = 0.2

        self.state = None
        self.new_state = None

        self.reward = None
        self.cum_reward = 0

        self.possible_actions = Environment.valid_actions
        self.action = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.next_waypoint = None
        self.moves = 0

        self.state = None
        self.new_state = None

        self.reward = 0
        self.cum_reward = 0

        self.action = None

    def getQvalue(self, state, action):
        key = (state, action)
        return self.qDict.get(key, 5.0)

    def getMaxQ(self, state):
        q = [self.getQvalue(state, a) for a in self.possible_actions]
        return max(q)

    def get_action(self, state):
        """
		epsilon-greedy approach to choose action given the state 
		"""
        if random.random() < self.epsilon:
            action = random.choice(self.possible_actions)
        else:
            q = [self.getQvalue(state, a) for a in self.possible_actions]
            if q.count(max(q)) > 1:
                best_actions = [
                    i for i in range(len(self.possible_actions))
                    if q[i] == max(q)
                ]
                index = random.choice(best_actions)

            else:
                index = q.index(max(q))
            action = self.possible_actions[index]

        return action

    def qlearning(self, state, action, nextState, reward):
        """
		use Qlearning algorithm to update q values
		"""
        key = (state, action)
        if (key not in self.qDict):
            # initialize the q values
            self.qDict[key] = 5.0
        else:
            self.qDict[key] = self.qDict[key] + self.alpha * (
                reward + self.gamma * self.getMaxQ(nextState) -
                self.qDict[key])

    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # take the following variables as states:
        # 	- environ(oncoming, left, right)
        #	- next_waypoint
        #	- traffic light
        self.new_state = inputs
        self.new_state['next_waypoint'] = self.next_waypoint
        self.new_state = tuple(sorted(self.new_state.items()))

        # for the current state, choose an action based on epsilon policy
        action = self.get_action(self.new_state)
        # observe the reward
        new_reward = self.env.act(self, action)
        # update q value based on q learning algorithm
        if self.reward is not None:
            self.qlearning(self.state, self.action, self.new_state,
                           self.reward)
        # set the state to the new state
        self.action = action
        self.state = self.new_state
        self.reward = new_reward
        self.cum_reward = self.cum_reward + new_reward
        self.moves = self.moves + 1
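The qlearning() method in the class above applies the full update Q(s, a) ← Q(s, a) + α·(r + γ·max_a′ Q(s′, a′) − Q(s, a)) once a key has been initialized. A quick numeric check of a single step, as a standalone sketch using the same α = 0.9 and γ = 0.2 as that agent (the reward and Q-values are made up):

alpha, gamma = 0.9, 0.2   # same learning rate and discount as the agent above
q_sa = 5.0                # optimistic initial Q-value for (state, action)
reward = 2.0              # reward observed after taking the action
max_q_next = 5.0          # best Q-value reachable from the next state

# target = reward + gamma * max_q_next = 2.0 + 1.0 = 3.0
q_sa = q_sa + alpha * (reward + gamma * max_q_next - q_sa)
assert abs(q_sa - 3.2) < 1e-9   # moved 90% of the way from 5.0 toward 3.0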
예제 #43
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial_num = 1

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        #self.epsilon = self.epsilon - 0.005
        #print "self.trial_num",[self.epsilon,self.trial_num,math.log(100.0/(self.trial_num+1)),math.log(100.0/self.trial_num)]
        if self.epsilon > 0.0:
            nr = 10000.0
            self.epsilon = self.epsilon * math.log(
                nr / (self.trial_num + 1)) / math.log(nr / self.trial_num)
        else:
            self.epsilon = 0.0
        """
        if self.alpha>0.3:
            self.alpha=self.alpha-0.05
        else:
            self.alpha=0.3
            
        """
        self.alpha = self.epsilon + 0.2
        if self.alpha > 1.0:
            self.alpha = 1.0
        self.trial_num = self.trial_num + 1
        if testing == True:
            self.epsilon = 0.0
            self.alpha = 0.0
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn
        # about the balance between exploration and exploitation.
        # With hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        #state = None
        #state={'waypoint':waypoint,'light':inputs['light'],'oncoming':inputs['oncoming'], 'left':inputs['left'] }
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = None
        actions_dict = self.Q[state] if state in self.Q else {}
        """
        for action, value in actions_dict.iteritems():
                if maxQ==None:
                    maxQ=(value,action)
                elif value>maxQ[0]:
                    maxQ=(value,action)
                elif value<maxQ[0]:
                    pass
                elif value==maxQ[0]:
                    maxQ.add(action)
        """
        for action, value in actions_dict.iteritems():
            if maxQ is None or value > maxQ:
                maxQ = value
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if state not in self.Q:
            self.Q[state] = {
                "left": 0.0,
                "right": 0.0,
                "forward": 0.0,
                None: 0.0
            }
            # Give the waypoint action (state[0]) a small optimistic prior so it is tried first
            self.Q[state][state[0]] = 1.0
        return self.Q

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None
        """action_id = random.randint(0,3)
        action  = self.valid_actions[action_id]
        current_input = self.env.sense(self)
        light_state = current_input['light']
        action =  self.next_waypoint
        print {'light_state':light_state,'self.next_waypoint':self.next_waypoint,'action':action, 'left':current_input['left'] , 'oncoming':current_input['oncoming'],'right':current_input['right']}
        if light_state != 'green':
            if self.next_waypoint != 'right':
                action = None
            else:
                if current_input['left'] == 'forward' or current_input['oncoming'] == 'left':
                    action = None
                else:
                    action =self.next_waypoint
        if light_state == 'green':
            if self.next_waypoint != 'left':
                action = self.next_waypoint
            elif current_input['oncoming']=='forward':
                action = None
#        action_id = random.randint(0,3)
#        action  = self.valid_actions[action_id]
#        print "action:",action,action_id
        """
        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(self.env.valid_actions)
        if not self.learning:
            return random.choice(self.env.valid_actions)

        maxQ = None
        for action, value in self.Q[state].iteritems():
            if not maxQ:
                maxQ = [value, action]
            elif maxQ[0] == value:
                maxQ.append(action)
            elif maxQ[0] < value:
                maxQ = [value, action]

        action_to_return = random.choice(maxQ[1:])
        """if action_to_return==None:
            action_to_return = None"""
        return action_to_return

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.alpha * reward + (
                1 - self.alpha) * self.Q[state][action]
        return True

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        #print "taking action:",action,type(action)
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
예제 #44
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.policy = {}
        self.alpha = 0.4
        self.gamma = 0.3
        self.actions = [None, 'forward', 'right', 'left']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        # The state is defined by what the current environment is, i.e., the traffic light and the
        # traffic at the intersection, plus the location of the next waypoint.
        # (Hard-coding states is a bad idea since there are many of them.)
        self.state = inputs
        self.state['next_waypoint'] = self.next_waypoint
        self.state = tuple(self.state.items())
        # select random action if state not present in the policy: Agent has not seen the state

        # TODO: Select action according to your policy
        #print type(self.policy)
        action = self.actions[random.randrange(4)] if self.state not in self.policy else self.policy[self.state]
        reward = self.env.act(self, action)
        state_prime = self.env.sense(self)
        state_prime['next_waypoint'] = self.planner.next_waypoint()
        state_prime = tuple(state_prime.items())
        
        # Now we have all the inputs to the Q-learning formula: state, reward, action, and next_state.
        # What goes next? We tweak the Q-value for the current state-action pair and then record, in the
        # policy, the action that maximizes Q for that state (the greedy policy).
        # The formula needs the utility of the next state, max_a' Q(s', a'). Since the policy stores
        # argmax_a Q(s, a), looking up the policy's action for s' approximates max_a' Q(s', a') as
        # Q-learning converges, so we avoid recomputing the maximum over all actions at every step.
        
        action_prime  = self.actions[random.randrange(4)] if state_prime not in self.policy else self.policy[state_prime]

        # TODO: Learn policy based on state, action, reward
        # Q_learning
        state_act_pair = (self.state, action)
        self.Q[state_act_pair] = (1 - self.alpha) * self.Q.get(state_act_pair, 0) + self.alpha*(reward + self.gamma*(self.Q.get((state_prime, action_prime), 0)))
        # Now we need to update the policy well(Take action that maximizes the utility of the present state, thus updating policy)
        Q_max_candidates = [self.Q.get((self.state, act)) for act in self.actions]
        try:
            if None not in Q_max_candidates:
                # PERFORMS VERY POORLY WHEN ALL STATES NOT ALREADY SEEN.(11/100) THIS IS SO BECAUSE THE POLICY SHOULD NOT BE UPDATED 
                # UNTIL ALL THE ACTIONS CORRESPONDING TO THE STATE HAS BEEN SEEN. OR ELSE IT WILL KEEP FOLLOWING THE SAME POLICY. AND 
                # WILL NEVER FURTHER LEARN.
                # ANOTHER METHOD IS THE EXPLORATION-EXPLOITATION METHOD WHEREBY THE ACTION IN THE POLICY IS TAKEN WITH PROB 1-e AND ANOTHER
                # RANDOM ACTION WITH PROB e. 
                Q_max = max(Q_max_candidates)
                actions_candidates = [act for act in self.actions if Q_max == self.Q.get((self.state, act))]
                self.policy[self.state]= random.choice(actions_candidates)
        except Exception as e:
            print "Could not update the policy by choosing the max Q: {}".format(e)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
예제 #45
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.state = None
        self.q_table = {}
        self.frame = 0
        self.last_state_tuple = None
        self.last_action = None
        self.alpha = .5

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.last_state_tuple = None
        self.last_action = None

    def update(self, t):

        self.frame += 1

        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  #e.g. left, forward, right
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        env_state = self.env.sense(self)
        state_tuple = (env_state['light'], self.next_waypoint)
        self.state = "Light: %s, Waypoint: %s" % state_tuple

        # TODO: Select action according to your policy
        # TODO: Learn policy based on state, action, reward

        #default to random action
        action = random.choice([None, 'forward', 'left', 'right'])

        #look for action prescribed by q table
        if state_tuple in self.q_table:

            #calculate epsilon; n_trials is assumed to be defined at module level (e.g. in the run() routine)
            epsilon = 1 - (((1. / (n_trials * 30.)) * self.frame) + .5)
            if epsilon >= 1.:
                epsilon = .99
            print "EP: ", epsilon

            #randomly decide whether to act randomly, based on epsilon
            roll = random.random()
            if roll < epsilon:
                reward = self.env.act(self, action)
                self.q_table[state_tuple].append((action, reward))
            else:
                #act according to highest recorded q value for the state
                #(falls back to the random default action when no recorded value exceeds 0)
                q_temp = 0
                for i in self.q_table[state_tuple]:
                    val = i[1]
                    if val > q_temp:
                        q_temp = val
                        action = i[0]
                reward = self.env.act(self, action)

        #if action not in q table:
        else:
            reward = self.env.act(self, action)

            #q value for the state is "initialized" during the simulation (i.e. when the state/action pair is first encountered).
            #It is initialized to the immediate reward.
            self.q_table[state_tuple] = [(action, reward)]

        #update q value for last state
        if self.last_state_tuple is not None:
            c = 0
            for i in self.q_table[self.last_state_tuple]:
                if i[0] == self.last_action:
                    last_act = self.q_table[self.last_state_tuple][c][0]
                    last_val = self.q_table[self.last_state_tuple][c][1]
                    updated_val = (last_val *
                                   (1 - self.alpha)) + (self.alpha * reward)

                    self.q_table[self.last_state_tuple][c] = (last_act,
                                                              updated_val)

        #save last state_tuple, so it can be updated during the next iteration
        self.last_state_tuple = state_tuple
        self.last_action = action

        print "FRAME: ", self.frame
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, env_state, action, reward)  # [debug]