class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.6, discount_factor=0.35):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.state = None
        self.Q_values = self.initialize_Q_values()
 
    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
       
    def initialize_Q_values(self, val=10.0):
        Q_values = {}
        for waypoint in ['left', 'right', 'forward']:
            for light in ['red', 'green']:
                for action in self.env.valid_actions:
                    Q_values[((waypoint, light), action)] = val
        return Q_values

    def find_max_Q(self, state):
        action = None
        max_Q = float('-inf')  # start below any possible Q value so states whose Q values are all negative are handled correctly
        for a in self.env.valid_actions:
            Q = self.Q_values[(state, a)]
            if Q > max_Q:
                action = a
                max_Q = Q
        return (max_Q, action)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'])

        # TODO: Select action according to your policy
        #action = random.choice(self.env.valid_actions) 
        (Q, action) = self.find_max_Q(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        state_prime = (next_waypoint, inputs['light'])
        (Q_prime, action_prime) = self.find_max_Q(state_prime)        
        Q += self.alpha * (reward + self.gamma*Q_prime - Q) 
        self.Q_values[(self.state, action)] = Q

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #2
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint
        self.learner = QLearn(epsilon=1, alpha=0.3, gamma=0.6)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.learner.action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_inputs = self.env.sense(self) 
        self.future_next_way_out = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.future_next_way_out)
        #next_state = (next_inputs[0], next_inputs[1], next_inputs[3], self.planner.next_waypoint())
        self.learner.learn(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #3
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        #Actions
        self.A = ['forward', 'left', 'right', None] #all the actions available
        self.trial = 0

        #Initialize Q table(light, oncoming, next_waypoint)
        self.Q={}
        for i in ['green', 'red']:  #possible lights
            for j in [None, 'forward', 'left', 'right']: #possible oncoming_agent
                for k in ['forward', 'left', 'right']: #possible next_waypoints
                    self.Q[(i,j,k)] = [1] * len(self.A)


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)
        #Find the index of the action with the highest Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        #assign action
        p = random.randrange(0, 5)
        epsilon = 1.5  # p < 1.5 holds for p in {0, 1}, so the agent acts randomly 40% of the time
        if p < epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        gamma = 0.30
        alp_tune = 500 # tuning parameter
        alpha = 1.0/(1.1 + float(self.trial)/alp_tune) # decay learning rate (float division so alpha decays smoothly)
        self.trial = self.trial+1

        #get the next action state Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_next_waypoint)

        #Update Q Table
        self.Q[self.state][self.A.index(action)] = \
        (1-alpha)*self.Q[self.state][self.A.index(action)] + \
        (alpha * (reward + gamma * max(self.Q[next_state])))
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        valid_waypoints = ['forward','right','left']
        valid_lights = ['red','green']
        valid_actions = ['forward','right','left', None]

        self.Q_hat={}
        states = [(next_waypoint,light,oncoming,right,left) for next_waypoint in valid_waypoints for light in valid_lights for oncoming in valid_actions for right in valid_actions for left in valid_actions]
        states.append('Destination')
        for state in states:
            self.Q_hat[state] = {}
            for action in valid_actions:
                self.Q_hat[state][action] = random.random()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if self.next_waypoint == None:
            self.state = 'Destination'
        else:
            self.state = (self.next_waypoint, inputs['light'],inputs['oncoming'],inputs['right'],inputs['left'])

        # TODO: Select action according to your policy

        valid_actions = ['forward','right','left', None]
        epsilon = 0.1 #0.01

        best_action = max(self.Q_hat[self.state],key=self.Q_hat[self.state].get)
        random_action = choice(valid_actions)
        action = choice([best_action, random_action],p=[1-epsilon,epsilon])

        # Execute action and get reward

        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward

        new_next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs = self.env.sense(self)
        if new_next_waypoint == None:
            new_state = 'Destination'  # mirror the state encoding used above so the Q_hat lookup cannot fail
        else:
            new_state = (new_next_waypoint, new_inputs['light'],new_inputs['oncoming'],new_inputs['right'],new_inputs['left'])
        alpha = 0.5 #opt 0.7
        gamma = 0.5 #opt 0.1
        max_Qhat_ahat = max(self.Q_hat[new_state].values())
        self.Q_hat[self.state][action] = (1-alpha)*self.Q_hat[self.state][action]+alpha*(reward+gamma*max_Qhat_ahat)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #5
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha, q_initial):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.policy = {}
        self.trip_log = []
        self.trip = None
        self.alpha = alpha
        self.q_initial = q_initial

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        if self.trip:
            self.trip_log.append(self.trip)
        self.trip = {}
        self.trip['Deadline'] = self.env.get_deadline(self)
        self.trip['Reward'] = 0
        self.trip['Penalty'] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        inputs.pop('right')
        self.state = inputs
        self.state['waypoint'] = self.next_waypoint

        # TODO: Select action according to your policy
        state_hash = hashabledict(frozenset(self.state.iteritems()))
        q = self.policy.get(state_hash, {
            None: self.q_initial,
            'forward': self.q_initial,
            'left': self.q_initial,
            'right': self.q_initial,
        })
        action = max(q.iteritems(), key=operator.itemgetter(1))[0]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Update trip stats
        self.trip['Reward'] += reward
        self.trip['Remaining'] = self.env.get_deadline(self)
        self.trip['Success'] = self.planner.next_waypoint() == None
        if reward < 0: self.trip['Penalty'] += reward

        # TODO: Learn policy based on state, action, reward
        q[action] = (1 - self.alpha) * q[action] + self.alpha * reward
        self.policy[state_hash] = q
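The hashabledict helper and the operator module used above are not shown in this snippet. A minimal sketch consistent with the call hashabledict(frozenset(self.state.iteritems())) could look like this; it is an assumed implementation, not the original one.

import operator

class hashabledict(dict):
    """A dict that can itself be used as a dictionary key (sketch of an assumed implementation)."""
    def __hash__(self):
        # Hash the frozen set of items so equal dicts produce equal hashes.
        return hash(frozenset(self.iteritems()))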
Example #6
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.QLearning = QLearning(epsilon=0.3, alpha=0.95, gamma=0.9)
        # self.QLearning = pickle.load(open('./QLearning.pkl'))

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def is_action_okay(self, inputs):
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False

        return action_okay

    def update(self, t):
        # Gather states
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        action_okay = self.is_action_okay(inputs)
        self.state = (action_okay, self.next_waypoint, deadline < 3)
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.QLearning.policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Gather next states
        next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        next_inputs = self.env.sense(self)
        next_deadline = self.env.get_deadline(self)
        next_action_okay = self.is_action_okay(next_inputs)
        next_state = (next_action_okay, next_waypoint, next_deadline < 3)
        # next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_waypoint)

        # TODO: Learn policy based on state, action, reward
        self.QLearning.update_QTable(self.state, action, reward, next_state)
Example #7
class BasicLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(BasicLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0
        self.deadline = self.env.get_deadline(self)
        self.actions = ['forward', 'left', 'right', None]
        self.reached_destination = 0
        self.penalties = 0
        self.movements = 0

    def tuple_state(self, state):
        State = namedtuple("State", ["light", "next_waypoint"])
        return State(light = state['light'], next_waypoint = self.planner.next_waypoint()) 

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # TODO: Update state
        self.state = self.tuple_state(inputs)

        # TODO: Select action according to your policy
        action = random.choice(self.actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        if reward >= 10: self.reached_destination += 1
        if reward < 0: self.penalties += 1
        self.movements += 1

        # TODO: Learn policy based on state, action, reward 
        self.last_action = action
        self.last_state = self.state
        self.last_reward = reward
        self.total_reward += reward
Example #8
File: agent.py  Project: qmaruf/smartcab
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here        
        self.actions = ['forward', 'left', 'right',None]        
        self.q_table = Counter()        
        self.alpha = 0.5
        self.gamma = 0.2


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def get_best_action(self, state):
        best_profit = -999
        best_action = None
        for a in self.actions:
            profit = self.q_table[(state, a)]            
            if profit > best_profit:
                best_profit = profit
                best_action = a
        return best_action, best_profit


    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state        
        self.state = (inputs['light'],inputs['oncoming'], inputs['left'], self.next_waypoint)       

        if randint(0,10) < 3:
            action = self.actions[randint(0, 3)]
        else:        
            action, _ = self.get_best_action(self.state)
        
        reward = self.env.act(self, action)

        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_inputs['left'], next_next_waypoint)      
       
        _, best_profit = self.get_best_action(next_state)
        self.q_table[(self.state, action)] = (1-self.alpha)*self.q_table[(self.state, action)] + (self.alpha * (reward + self.gamma * best_profit))

       
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #9
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.debugOn = False
        self.reporter = Reporter(driver_type, self.env.valid_actions)
        if driver_type=='random':
            self.driver = RandomDriver(self.env.valid_actions)
        elif driver_type=='greedy':
            self.driver = GreedyDriver(self.env.valid_actions)
        elif driver_type=='greedy2':
            self.driver = Greedy2Driver(self.env.valid_actions)
        elif driver_type=='smart':
            self.driver = SmartDriver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart2':
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart3':
            self.driver = Smart3Driver(self.env.valid_actions, self.reporter)
        else:
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.reporter.reset(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = State.make(self.next_waypoint, inputs)
        
        # TODO: Select action according to your policy
        action = self.driver.decide(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        newWaypoint = self.planner.next_waypoint()
        newState = State.make(newWaypoint, self.env.sense(self))
        self.driver.learn(t, deadline, self.state, action, reward, newState)

        location = self.env.agent_states[self]['location']
        if self.debugOn:
            print "LearningAgent.update(): deadline={} inputs={} action={} reward={} location={}".format(deadline, inputs, action, reward, location)  # [debug]
        self.reporter.update(t, deadline, self.state, action, reward, location, self.driver)
Example #10
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.pi = {}
        self.alpha = 1
        self.gema = 0
        self.ACTIONS = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = inputs
        self.state['next_waypoint'] = self.next_waypoint
        self.state = tuple(self.state.items())
        # TODO: Select action according to your policy

        action = random.choice(self.ACTIONS) if self.state not in self.pi else self.pi[self.state]
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_state = self.env.sense(self)
        next_state['next_waypoint'] = self.planner.next_waypoint()
        next_state = tuple(next_state.items())
        next_action = random.choice(self.ACTIONS) if next_state not in self.pi else self.pi[next_state]
        sta_act = (self.state, action)
        self.Q[sta_act] = (1 - self.alpha) * self.Q.get(sta_act, 0) + self.alpha*(reward + self.gema*self.Q.get((next_state, next_action), 0))
        acts_Q = [self.Q.get((self.state, act)) for act in self.ACTIONS]
        if None not in acts_Q:
            max_Q = max(acts_Q)
            max_acts = [act for act in self.ACTIONS if max_Q==self.Q.get((self.state, act))]
            self.pi[self.state] = random.choice(max_acts)
        #print self.Q
        #print 
        #print self.pi
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #11
File: QLearn.py  Project: mic0331/smartcab
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.possible_actions= Environment.valid_actions
        self.ai = QLearn(epsilon=.05, alpha=0.1, gamma=0.9)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        inputs = inputs.items()
        deadline = self.env.get_deadline(self)

        self.state = (inputs[0],inputs[1],inputs[3],self.next_waypoint)
        
        action = self.ai.Q_move(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        inputs2 = self.env.sense(self) 
        inputs2 = inputs2.items()
        next_state = (inputs2[0],inputs2[1],inputs2[3],self.next_waypoint)

        self.ai.Q_post(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #12
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        # # from route planner, also displayed by simulator
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = self.next_waypoint, inputs['light']
        # self.state = self.next_waypoint, (inputs['oncoming'] == None or inputs['light'] == 'green')

        # TODO: Select action according to your policy
        # Define actions
        actions = [None, 'forward', 'left', 'right']

        # QUESTION 1- select random action
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
Example #13
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = 0.2
        self.Qtable = {}
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.reward = 0
        
        
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # Inform Driving Agent
        self.state = (inputs['light'], inputs['oncoming'], 
                      inputs['left'], inputs['right'], self.next_waypoint)
        
        
        # TODO: Select action according to your policy
        valid_actions = [None, 'forward', 'left', 'right']
        
        # Random Pick Actions
        ### action = random.choice(valid_actions)
        
        # Get all Q values for all state
        all_actions = {action: self.Qtable.get((self.state, action), 0)
                        for action in valid_actions}
                           
        # Find action that maximizes Q values
        best_actions = [action for action in valid_actions 
                        if all_actions[action] == max(all_actions.values())]
           
        # Return Best Action
        action = random.choice(best_actions)    
         
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        # To simplify the problem, I set gamma equals 0.
        self.Qtable[(self.state, action)] = \
        (1 - self.alpha) * self.Qtable.get((self.state, action), 0) + self.alpha * reward
     
           
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #14
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        
        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]
Example #15
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_matrix = {}
        self.alpha = 0.95
        self.gamma = 0.05
        self.explore = 0.05
        self.state = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        deadline = self.env.get_deadline(self)

        state = self.observe_state()
        self.state = state
        
        knowledge = self.knowledge_from_state(state)

        # TODO: Select action according to your policy
        if random() < self.explore:
            action = choice(OPTIONS)
        else:
            action = sorted(knowledge.items(), key=lambda x: x[1], reverse=True)[0][0]

        old_value = knowledge[action]

        # Execute action and get reward
        reward = self.env.act(self, action)
        new_state = self.observe_state()
        new_state_knowledge = self.knowledge_from_state(new_state)

        self.q_matrix[state][action] = (1-self.alpha) * old_value +\
                                       self.alpha * (reward + self.gamma * max(new_state_knowledge.values()))
        

        #print "LearningAgent.update(): deadline = {}, state = {}, action = {}, reward = {}".format(deadline, state, action, reward)  # [debug]

    def knowledge_from_state(self, state):
        """
        Gets what we know about the given state
        """
        if not state in self.q_matrix:
            self.q_matrix[state] = {option: 0 for option in OPTIONS}
        return self.q_matrix[state]

    def observe_state(self):
        """
        Observe the current state
        """
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        return (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
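The agent above relies on a module-level OPTIONS list and on random()/choice() from the random module; neither is shown in this snippet, so the following setup is an assumption based on how they are used.

from random import random, choice

# Valid driving actions assumed by the agent above.
OPTIONS = [None, 'forward', 'left', 'right']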
Example #16
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # location, heading, delta, deadline, oncoming, traffic light state
        inputs['waypoint'] = self.next_waypoint

        self.state = tuple(sorted(inputs.items())) # performs the actual update

        # TODO: Select action according to your policy
        actions = [None,'forward','left','right']
        action = actions[random.randint(0,3)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class BasicAgent(Agent):
    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.color = 'red'  # override color

    def reset(self, destination=None):
        self.planner.route_to(destination)  
        
    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        action = random.choice(self.env.valid_actions)
        reward = self.env.act(self, action)
        
        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        right = inputs['right']
       
        #Update the current observed state
        self.state = (light, oncoming, left, right, self.next_waypoint)
        
        print "Basic.update(): next move = {}".format(action)  # [debug]
class LearningAgent(Agent):
    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.next_waypoint = None
        self.total_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        loc, hed = self.get_my_location()
        action = self.get_next_waypoint_given_location( loc, hed)
        action_okay = self.check_if_action_is_ok(inputs)
        if not action_okay:
            action = None
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        #TODO: Learn policy based on state, action, reward
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #19
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here



    def reset(self, destination=None):
        self.planner.route_to(destination)



    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = inputs['location']
        self.heading=inputs['heading']

        action = random.choice(Environment.valid_actions)
        reward = self.env.act(self, action)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]
Example #20
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    
    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.successes=0
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint        
        self.learner=Q_Learner()
        self.mapper= StateMapper(self.learner)                        
        self.actionChooser= QValueRandomMixActionChooser(self.learner)
        self.negative_rewards=0

    def reset(self, destination=None):
        self.learner.prior_state=None
        self.actionChooser.first=True        
        self.planner.route_to(destination)
        
    def update(self, t):
        # Gather inputs
        learner= self.learner        
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        if learner.prior_state==None:
            learner.prior_state= self.mapper.mapDataToState(self.next_waypoint,inputs,deadline)           
            self.state=learner.prior_state[1]
        else:        
            learner.update()
            learner.prior_state=learner.current_state                
        
        # Select action according to your policy
        action= self.actionChooser.chooseAction()
        learner.action=action
        # Execute action and get reward
        learner.reward = self.env.act(self, action)
        if learner.reward<0:
            self.negative_rewards+=learner.reward
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        location = self.env.agent_states[self]["location"] 
        destination = self.env.agent_states[self]["destination"]
        inputs = self.env.sense(self)
        learner.current_state= self.mapper.mapDataToState(self.next_waypoint,inputs,deadline)
        self.state=learner.current_state[1]
        #keep track of how often we were successful for later evaluation        
        if location==destination:
            self.successes= self.successes+1
Example #21
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.prevReward = 0
        self.prevAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.prevReward = 0
        self.prevAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = None
        validActions = []
        
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                validActions = ['right', None]
        else:
            if inputs['oncoming'] == 'forward':
                validActions = [ 'forward','right']
            else:
                validActions = ['right','forward', 'left']

        # TODO: Select action according to your policy
        if validActions != []:
            action = random.choice(validActions)
        else:
            action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if action != None:
            self.state = (inputs, self.next_waypoint, deadline)
            self.prevAction = action
            self.prevReward = reward


        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #22
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qtable = defaultdict(dict)
        self.epsilon = .4  # Randomness factor
        self.alpha = .4  # Learning rate.

    def reset(self, destination=None):
        # Prepare for a new trip; reset any variables here, if required
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.define_state(inputs, self.next_waypoint)

        # Select action according policy
        action = self.select_action()

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_qtable(action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]

    def select_action(self):
        if random.random() < self.epsilon or self.state not in self.qtable:
            action = random.choice(self.env.valid_actions)
        else:
            action = max(self.qtable[self.state], key=self.qtable[self.state].get)

        if self.state not in self.qtable:
            for a in self.env.valid_actions:  # init action:reward dict.
                self.qtable[self.state][a] = 0
        return action

    def update_qtable(self, action, reward):
        old_q = self.qtable[self.state].get(action, 0)
        self.qtable[self.state][action] = ((1 - self.alpha) * old_q) + (self.alpha * reward)
        # Decay alpha and epsilon until they reach 0.001 and 0.0, respectively.
        self.alpha = max(0.001, self.alpha - 0.002)
        self.epsilon = max(0.0, self.epsilon - 0.002)

    @staticmethod
    def define_state(inputs, next_waypoint):
        return tuple(inputs.values()) + (next_waypoint, )
Example #23
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Qtable = {}
        self.alpha = 0.8
        self.gamma = 0.6

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        old_state = self.state
        
        # TODO: Select action according to your policy
        if (old_state, None) not in self.Qtable.keys():
            self.Qtable[(old_state, None)] = 0
        if (old_state, 'left') not in self.Qtable.keys():
            self.Qtable[(old_state, 'left')] = 0
        if (old_state, 'forward') not in self.Qtable.keys():
            self.Qtable[(old_state, 'forward')] = 0
        if (old_state, 'right') not in self.Qtable.keys():
            self.Qtable[(old_state, 'right')] = 0
        
        max_action = max(self.Qtable[(old_state, None)], self.Qtable[(old_state, 'left')],self.Qtable[(old_state, 'forward')],self.Qtable[(old_state, 'right')])
        action = random.choice([ i for i in ['left', 'forward', 'right', None] if self.Qtable[(old_state, i)] == max_action])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        #self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        new_state = self.state

        if (new_state, None) not in self.Qtable.keys():
            self.Qtable[(new_state, None)] = 0
        if (new_state, 'left') not in self.Qtable.keys():
            self.Qtable[(new_state, 'left')] = 0
        if (new_state, 'forward') not in self.Qtable.keys():
            self.Qtable[(new_state, 'forward')] = 0
        if (new_state, 'right') not in self.Qtable.keys():
            self.Qtable[(new_state, 'right')] = 0

        self.Qtable[(old_state, action)] = (1 - self.alpha) * self.Qtable[(old_state, action)] + self.alpha *(reward + self.gamma * max(self.Qtable[(new_state, i)] for i in [None, 'left', 'forward', 'right']))
Example #24
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = [None, 'forward', 'left', 'right']
        self.learning_rate = 0.3
        self.state = None
        self.q = {}
        self.trips = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.trips += 1
        if self.trips >= 100:
            self.learning_rate = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],
                 self.next_waypoint)
        self.state = state

        # Lazy initialization of Q-values
        for action in self.actions:
            if (state, action) not in self.q:
                self.q[(state, action)] = 1
        
        # Select action according to the policy

        probabilities = [self.q[(state, None)], self.q[(state, 'forward')], self.q[(state, 'left')], self.q[(state, 'right')]]
        # Use the softmax function so that the values behave like probabilities
        probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=0)

        adventurousness = max((100 - self.trips) / 100.0, 0)  # float division so exploration decays gradually over the first 100 trips
        if random.random() < adventurousness:
            action = np.random.choice(self.actions, p=probabilities)
        else:
            action = self.actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q[(state, action)] = self.learning_rate * reward + (1 - self.learning_rate) * self.q[(state, action)]

        #print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def get_state(self):
        return self.state
Example #25
File: q_agent.py  Project: xkeecs/smartCab
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        self.time_step = 1.0
    
    def get_decay_rate(self, t): #Decay rate for epsilon
       return 1.0 / float(t)
       
    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.time_step = 1.0
        # TODO: Prepare for a new trip; reset any variables here, if required
        '''
        self.previous_action = None
        self.next_state = None
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        '''
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        #env_states = self.env.agent_states[self]
        self.time_step += 1
        epsilon = self.get_decay_rate(self.time_step)
        self.QLearner.set_eps(epsilon)
        
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        #self.state = (inputs['light'], self.next_waypoint)

        action = self.QLearner.q_move(self.state)        
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        
        new_inputs = self.env.sense(self)
        #env_states = self.env.agent_states[self]
        new_deadline = self.env.get_deadline(self)
        self.next_state =  (new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], self.next_waypoint)
        #self.next_state =  (new_inputs['light'], self.next_waypoint)
        
        # TODO: Learn policy based on state, action, reward

        self.QLearner.q_future_reward(self.state, action, self.next_state, reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #26
class BasicAgent(Agent):
    """A basic agent upon which to build learning agents."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qvals = {} # mapping (state, action) to q-values
        self.time = 0 # number of moves performed
        self.possible_actions = (None, 'forward', 'left', 'right')
        self.n_dest_reached = 0 # number of destinations reached
        self.last_dest_fail = 0 # last time agent failed to reach destination
        self.sum_time_left = 0 # sum of time left upon reaching destination over all trials
        self.n_penalties = 0 # number of penalties incurred
        self.last_penalty = 0 # last trial in which the agent incurred in a penalty

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def best_action(self, state):
        """
        Return a random action (other agents will have different policies)
        """
        return random.choice(self.possible_actions)

    def update_qvals(self, state, action, reward):
        """
        Keeps track of visited (state, action) pairs. 
        (in other agents will use reward to update 
        the mapping from (state, action) pairs to q-values)
        """
        self.qvals[(state, action)] = 0
    
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # update time
        self.time += 1

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'],
                      self.next_waypoint)

        # Pick an action
        action = self.best_action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.n_penalties += 1
        
        # Update the q-value of the (state, action) pair
        self.update_qvals(self.state, action, reward)
        
        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #27
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        
        # TODO: Initialize any additional variables here
        self.acts = ['left', 'right', 'forward', 'stay']
        df_tmp = pd.DataFrame(0*np.ones((2, 4)), index=['red', 'green'], columns=self.acts)
        self.Q = df_tmp
        self.deadline = 0
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        self.deadline = deadline

        # TODO: Update state
        self.state = inputs['light']  # in this case the state is simply the color of the traffic light at the intersection
        
        # TODO: Select action according to your policy
        # Q-Learner
        act_opt = self.Q.loc[self.state].idxmax()
        if act_opt == 'stay':
            act_opt = None
        # print "Current state = %s, Q-learner action = %s" % (self.state, act_opt)
        action = act_opt

        # Execute action and get reward
        reward = self.env.act(self, action)


        # TODO: Learn policy based on state, action, reward

        # Maximize Q conditional on the state we are in = S
        q_S = self.Q.loc[self.state]  # Q conditional on state S
        q_S_mx = q_S[q_S == q_S.max()]  # the action that maximizes Q(S, a) may not be unique
        ix_mx = np.random.randint(0, len(q_S_mx))  # break ties at random
        q_S_mx = q_S_mx.iloc[ix_mx]

        # Update Q
        act_tmp = action if action is not None else 'stay'
        global alpha, alpha0, gamma
        alpha *= alpha0  # decay the learning rate (alpha, alpha0 and gamma are module-level globals)
        self.Q.loc[self.state, act_tmp] = \
            (1 - alpha) * self.Q.loc[self.state, act_tmp] + \
            alpha * (reward + gamma * q_S_mx)  # update the Q-value of the action actually taken, bootstrapping with the best (greedy) Q-value
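
# For comparison, the textbook one-step Q-learning update that the variant above approximates
# can be written as a small dict-based helper. This is only a sketch: the function and argument
# names are illustrative, and note that the agent above bootstraps from the same state's row
# rather than re-sensing the next state.
def q_update(Q, state, action, reward, next_state, actions, alpha=0.5, gamma=0.35):
    """Sketch of Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma*max_a' Q(s',a'))."""
    best_next = max(Q.get((next_state, a), 0.0) for a in actions)
    old = Q.get((state, action), 0.0)
    Q[(state, action)] = (1 - alpha) * old + alpha * (reward + gamma * best_next)
    return Q[(state, action)]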
Example #28
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.learning_rate = 0.9
        self.states = None
        self.possible_actions = [None, 'forward', 'left', 'right']
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)



        # TODO: Update state
        # state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],deadline)
        state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])
        # state = (self.next_waypoint,inputs['light'])
        self.state = state

        for action in self.possible_actions:
            if (state, action) not in self.Q:
                self.Q[(state, action)] = 20

        # TODO: Select action according to your policy
        Q_values = [self.Q[(state, action)] for action in self.possible_actions]
        # Softmax
        probabilities = np.exp(Q_values) / np.sum(np.exp(Q_values), axis=0)

        # action = np.random.choice(self.possible_actions, p=probabilities)
        action = self.possible_actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward == -10.0 :
            print "Invalid move"
        if reward == -0.5 :
            print "valid, but is it correct?"
        

        # TODO: Learn policy based on state, action, reward
        # self.Q[(state,action)] = reward + self.learning_rate * self.Q[(state,action)]
        self.Q[(state,action)] = reward * self.learning_rate + ( 1 - self.learning_rate ) * self.Q[(state,action)]
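
# The example above computes softmax probabilities but then still takes the argmax
# (the np.random.choice line is commented out). A small sketch of actually sampling
# from that distribution, with names chosen here purely for illustration, might look like:
import numpy as np

def softmax_action(q_values, actions, temperature=1.0):
    """Sample an action in proportion to exp(Q/temperature); illustrative helper, not project code."""
    q = np.asarray(q_values, dtype=float) / temperature
    probs = np.exp(q - q.max())   # subtract the max for numerical stability
    probs /= probs.sum()
    return actions[np.random.choice(len(actions), p=probs)]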
Example #29
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.78, discount_factor=0.36):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.actions = Environment.valid_actions
        self.oldAction = None


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.oldAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        #------------------------------------------------
        #action = np.random.choice(self.actions)
        #------------------------------------------------
        


        # TODO: Select action according to your policy
        current_state = tuple(inputs.values() + [self.next_waypoint])
        self.state = (inputs, self.next_waypoint)
        
        if current_state not in self.Q:
            self.Q[current_state] = [3, 3, 3, 3]

        Q_max = self.Q[current_state].index(np.max(self.Q[current_state]))
        # Execute action and get reward
        action = self.actions[Q_max]
        reward = self.env.act(self, action)



        # TODO: Learn policy based on state, action, reward
        
        if t != 0:
            oldvalue = self.Q[self.oldAction[0]][self.oldAction[1]]
            newvalue = (1 - self.learning_rate) * oldvalue + self.learning_rate * (
                self.oldAction[2] + self.discount_factor * self.Q[current_state][Q_max])
            self.Q[self.oldAction[0]][self.oldAction[1]] = newvalue
        
        self.oldAction = (current_state, self.actions.index(action), reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #30
File: agent.py  Project: llathrop/smartcab
class RandomAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(RandomAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.availableAction = ['forward', 'left', 'right',None]   
        self.goal=0
        self.steps=0
        self.features=[]
        self.deadline=[]
        self.total_reward=[0]

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        #print"RESET, Final state:\n", self.state
        try:
            if self.deadline[len(self.features)-1] > 0:  # deadline remaining, so the destination was reached in time
                self.goal += 1  # FIXME - order
                print "PASS! {} steps remaining, goal reached {} times out of {}!".format(
                    self.deadline[len(self.features)-1], self.goal, len(self.features))
            else:
                print "FAIL! {} steps remaining, goal reached {} times out of {}!".format(
                    self.deadline[len(self.features)-1], self.goal, len(self.features))
        except IndexError:  # first trial: nothing recorded yet
            print "Trial 0 - Goal reached {} times out of {}!".format(self.goal, len(self.features))
        print "----------------------------------------------------------"
        self.features.append({})
        self.deadline.append(None)
        self.total_reward.append(0)
        self.steps=0

    def update(self, t):
        # Gather inputs
        self.steps+=1
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        #self.deadline[len(self.features)] = self.env.get_deadline(self)
        self.state = inputs
        self.features[-1][self.steps] = inputs
        self.deadline[-1] = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        action = self.availableAction[random.randint(0,3)]    
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.lastReward=reward
        # TODO: Learn policy based on state, action, reward
        self.total_reward[-1] += reward
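
# After a run, the bookkeeping this RandomAgent keeps can be summarised in a couple of lines;
# the helper below is illustrative only and assumes `agent` is the RandomAgent instance once
# the simulation has finished.
def summarize(agent):
    """Print the success rate and total reward recorded by RandomAgent (sketch only)."""
    success_rate = float(agent.goal) / max(len(agent.features), 1)
    print "success rate: {:.0%}, total reward: {}".format(success_rate, sum(agent.total_reward))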
Example #31
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """ 

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        
        #self.a = 0.01
        self.t = 0



    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        
        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        
        self.t = self.t+1
        
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            #self.epsilon = self.epsilon - 0.05
            #self.epsilon = 0.98**self.t
            #self.epsilon =1.0/(self.t**2)
            #self.epsilon = math.exp(-(0.01*self.t))
            self.epsilon = math.cos(0.003 * self.t)
            #self.epsilon = 1;
            
     
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint 
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent        
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'], inputs['oncoming'])
        
        #if self.learning:
            #self.createQ(state)


        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        
        ## My example for easy understanding
        # Q = {'left': 10.0, 'right': 11.0, 'forward': -5.0, None: 0.0}
        # max(Q.values()) ---> 11.0
        
        #maxQ = -1000.0
        #for action in self.Q[state]:
        #    if maxQ < self.Q[state][action]:
        #        maxQ = self.Q[state][action]
        
        #maxQ = max(self.Q[state][action] for action in self.valid_actions)
        
        maxQ = max(self.Q[state].values())        #max(self.Q[state])  
        
        return maxQ 


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        
        #if self.learning:
        #    if state not in self.Q:
        #        self.Q[state] = {'left':0.0, 'right':0.0, 'forward':0.0, None :0.0 }
        
        if (self.learning) and (state not in self.Q):
            self.Q[state] = {}
            for action in self.valid_actions:
                self.Q[state][action] = 0.0
                    
             #self.Q[state] =  self.Q.get(state, {None : 0.0 , 'left' : 0.0 , 'right' : 0.0 , 'forward' : 0.0 })
       

        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if self.learning:
                
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
                
            else:
                maxQ = self.get_maxQ(state)
                action = random.choice([action for action in self.valid_actions if self.Q[state][action]==maxQ])
            
                #maxActions = []
                #for action_key in self.Q[state]:
                #    if self.Q[state][action_key] == maxQ:
                #        maxActions.append(action_key)
                #action = random.choice(maxActions)
                
        else:
            action = random.choice(self.valid_actions)
 
        return action


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        
        #inputs_new = self.env.sense(self)
        #state_new = (self.planner.next_waypoint(), inputs_new['light'], inputs_new['oncoming'], inputs_new['left'], inputs_new['right'])
        #self.q_table[self.state][action] = (1 - alpha) * self.q_table[self.state][action] + alpha * (reward + gamma * max(self.q_table[state_new]))
        
        if self.learning:
            self.Q[state][action] = ( 1- self.alpha) * self.Q[state][action] + self.alpha * reward
            
  
        return


    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        
        return
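
# The commented-out alternatives in reset() above are the usual decay schedules. The snippet
# below just prints a few sample values for each so their behaviour over the training trials
# can be compared; it is illustrative only and not part of the agent.
import math

def decay_examples(t):
    """Sample values of the decay schedules tried in reset() above."""
    return {
        'linear 1 - 0.05t': max(1.0 - 0.05 * t, 0.0),
        'power 0.98^t': 0.98 ** t,
        'inverse square 1/t^2': 1.0 / (t ** 2),
        'exp(-0.01t)': math.exp(-0.01 * t),
        'cos(0.003t)': math.cos(0.003 * t),
    }

for t in (1, 10, 100, 300):
    print t, decay_examples(t)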
Example #32
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env, gamma, alpha):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # Initialize additional variables and QLearning here
        self.actions = ['forward', 'left', 'right', None]
        self.states = []
        self.q_learn = QLearning(self.actions,
                                 states=self.states,
                                 gamma=gamma,
                                 alpha=alpha)
        self.p_action = None
        self.p_reward = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def get_state_code(self, inputs):
        dic = {
            'red': 1,
            'green': 2,
            'forward': 1,
            'left': 2,
            'right': 3,
            None: 4
        }
        state_code = 0
        for i in xrange(len(inputs)):
            state_code += dic[inputs[i]] * 10**i
        return state_code

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Get previous state and action taken in state
        p_state, p_action, p_reward = self.state, self.p_action, self.p_reward

        # Get current state
        state = self.get_state_code([self.next_waypoint] + inputs.values())

        # if state is observed for the first time add it to Q and states list
        self.q_learn.add_new_state(state)

        # To avoid crash at first run when p_state == p_action == p_reward == None
        if p_state is not None:
            # Learn policy based on state, action, reward
            self.q_learn.update_Q(p_reward=p_reward,
                                  p_state=p_state,
                                  p_action=p_action,
                                  c_state=state)

        # Select action according to your policy
        action = self.q_learn.get_best_action(state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # store the current state, action and reward for the next step's update
        self.p_action, self.state, self.p_reward = action, state, reward

        global total_actions, total_rewards
        total_actions += 1
        total_rewards += reward
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
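
# As a quick check of get_state_code above: with the illustrative input ordering
# ['forward', 'green', None, None, None] (waypoint followed by the sensed inputs),
# each entry is mapped to a digit and scaled by a power of ten.
codes = {'red': 1, 'green': 2, 'forward': 1, 'left': 2, 'right': 3, None: 4}
sample = ['forward', 'green', None, None, None]   # ordering chosen for illustration
print sum(codes[v] * 10 ** i for i, v in enumerate(sample))   # -> 44421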
Example #33
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.alpha = 0
            self.epsilon = 0
        else:
            self.epsilon = max(
                self.epsilon - 0.01,
                0.0)  # Make epsilon smaller as the number of trials increases

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'])

        return state

    def get_maxQ_action(self, state):
        """ The get_max_Q_action function is called when the agent is asked to find the
            action with the maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Find the action that gives the maximum Q-value of all actions for a given state

        state_dict = self.Q[state]

        # find the max Q value
        maxQ = max(state_dict.values())

        # choose a random action out of all actions with the highest Q
        return random.choice(
            [item[0] for item in state_dict.items() if item[1] >= maxQ])

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if (self.learning and state not in self.Q):
            self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        elif random.uniform(0, 1) < self.epsilon:
            action = random.choice(self.valid_actions)
        else:
            action = self.get_maxQ_action(state)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Example #34
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        random.seed(42)

    def linear_decay(self):
        """
        A linear decaying function for the initial Q-Learning implementation
        """
        return self.epsilon - 0.05

    def optimized_decay(self):
        """
        An optimized version of the decay function
        """
        return math.pow(math.e, -.005 * self.t)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.t += 1

        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.optimized_decay()

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        return (inputs['oncoming'], inputs['left'], inputs['light'], waypoint)

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """
        # Calculate the maximum Q-value of all actions for a given state
        qs = self.Q[state]
        return max(qs[action] for action in self.valid_actions)

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning and state not in self.Q:
            self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                best_actions = [
                    action for action, q in self.Q[state].iteritems()
                    if maxQ == q
                ]
                action = random.choice(best_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        # When learning, implement the value iteration update rule
        #    Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            q = self.Q[state][action]
            self.Q[state][action] = q + self.alpha * (reward - q)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
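
# Note that the incremental form used in learn() above, q + alpha * (reward - q), is
# algebraically the same as the (1 - alpha) * q + alpha * reward form used by the other
# examples; a one-line check with arbitrary values:
q, r, a = 2.0, 1.0, 0.5
assert abs(((1 - a) * q + a * r) - (q + a * (r - q))) < 1e-12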
Example #35
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.testing = False
        self.t = 0.0  # t - number of tries

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.testing = testing
        self.t = self.t + 1.0

        if self.testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # First Q-learning implementation: epsilon = epsilon - 0.05
            # self.epsilon = self.epsilon - 0.05

            # Improved Q-learning implementation: epsilon = 1 / (t*t)
            self.epsilon = 1.0 / (self.t * self.t)  # 1 / t*t

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        # recommended method per code review
        maxQ = max(self.Q[str(state)].values())
        """
        maxQ = 0.0
        state_str = str(state)
        if  state_str in self.Q.keys():
            adict = self.Q[state_str]
            print("Q-list: state_str  ", state_str)
            for key in adict.keys():
                value = adict[key]
                print("Q-list: action = ", key, " ,value = ", value) 
                if value > maxQ:
                    maxQ = value
        else:
            print("state_str = ", state_str, " not in Q list!")
        """

        #maxQ = None
        print("max Q value = ", maxQ)

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        # Code review comment: Make sure your Q-Table is ONLY initialized when Learning == True
        if (not self.learning):
            return

        # convert state to string
        state_str = str(state)
        if state_str in self.Q.keys():
            return
        else:
            dict_item = dict()
            for action in self.valid_actions:
                dict_item[action] = 0.0
            self.Q[state_str] = dict_item

        return

    def getQ(self, state, action):
        """ Yuefeng add this function for calculating running average of rewards. """
        qvalue = 0.0
        state_str = str(state)
        if state_str in self.Q.keys():
            vdict = self.Q[state_str]
            if action in vdict.keys():
                qvalue = vdict[action]
            else:
                print("getQ: action not in vdict!!!")
        else:
            print("getQ: state not in self.Q!!!")

        return qvalue

    def setQ(self, state, action, qvalue):
        """ Yuefeng add this function for calculating running average of rewards and then updating Q value. """
        state_str = str(state)
        if state_str in self.Q.keys():
            vdict = self.Q[state_str]
            if action in vdict.keys():
                vdict[action] = qvalue
            else:
                print("setQ: action not in vdict!!!")
        else:
            print("setQ: state not in self.Q!!!")

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        state_str = str(state)
        action_index = random.randint(0, 3)
        if (not self.learning):
            action = self.valid_actions[action_index]
        else:
            random_number = random.uniform(0.0, 1.0)
            if random_number <= self.epsilon:
                action = self.valid_actions[action_index]
                print("exploration, action = ", action)
            else:
                # recommended method per code review
                best_actions = [
                    action for action in self.valid_actions if self.Q[str(
                        state)][action] == max(self.Q[str(state)].values())
                ]
                action = random.choice(best_actions)
                """
                maxQ = self.get_maxQ(state)
                best_actions = []
                dict_item = self.Q[state_str]
                for action in self.valid_actions:
                    qvalue = dict_item[action]
                    if qvalue >= maxQ:
                        best_actions.append(action)
                number_of_actions = len(best_actions)
                if number_of_actions < 1:
                    print("choose_action error: best actions list is empty!")
                    action = None
                elif number_of_actions == 1:
                    action = best_actions[0]
                    print("only one best action = ", action)
                else:
                    best_action_index = random.randint(0, number_of_actions - 1)
                    action = best_actions[best_action_index]
                    print("multiple best actions, randomly selected action = ", action)
                """

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        state_str = str(state)
        if self.learning:

            # get the previous Q value for the pair of (state, action)
            qvalue = self.getQ(state, action)
            print("current Q value = ", qvalue)

            # get alpha
            alpha = self.alpha

            # calculate new Q value
            new_qvalue = (1.0 - alpha) * qvalue + alpha * reward
            print("new Q value = ", new_qvalue)

            # update Q value
            self.setQ(state, action, new_qvalue)


        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
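
# One observation on the str(state) conversion used throughout the agent above: the state tuple
# here contains the raw inputs dict, which is unhashable, so converting to a string is one way to
# make it usable as a key. An alternative is to unpack the inputs into a flat tuple, which is
# hashable and can key the Q-table directly. Illustration only; field order chosen here:
waypoint, inputs = 'forward', {'light': 'green', 'oncoming': None, 'left': None, 'right': None}
state_key = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
Q = {state_key: {action: 0.0 for action in (None, 'forward', 'left', 'right')}}
print Q[state_key]['forward']   # -> 0.0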
Example #36
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.cnt = 1

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        # linear
        # self.epsilon -= 0.05
        # e^t
        # self.epsilon = math.exp(-.08*self.cnt) #
        # 1/t^2
        # self.epsilon = 100 * math.pow((1/self.cnt), 2)  #
        # a^t
        # self.epsilon = math.pow(.991, self.cnt) #
        # self.alpha = 1 - math.pow((1/self.cnt), 2)
        # cos
        if self.cnt < 300:
            self.epsilon = .6 + .2 * math.cos(.6 * self.cnt)
        else:
            self.epsilon = math.pow(.991, self.cnt)
        # Update additional class parameters as needed
        self.cnt += 1
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent

        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning and state not in self.Q:
            self.Q[state] = {None: 0., 'forward': 0., 'left': 0., 'right': 0.}
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if self.learning:
            if random.uniform(0, 1) < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                action_list = []
                mQ = self.get_maxQ(state)
                # for k, v in self.Q[state].items():
                #     if v == mQ:
                #         action_list.append(k)
                # action = random.choice(action_list)
                max_keys = [k for k, v in self.Q[state].items() if v == mQ]
                action = random.choice(max_keys)
        #  not learning
        else:
            action = random.choice(self.valid_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            old_q = self.Q[state][action]
            pred_q = reward
            self.Q[state][action] += self.alpha * (pred_q - old_q)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Example #37
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.train_no = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        import math
        self.train_no = self.train_no + 1
        # self.epsilon = math.e**(-0.01 * self.train_no)
        # self.alpha = math.e**(-0.001 * (self.train_no))
        # self.epsilon = self.epsilon - 0.05*self.train_no
        # self.alpha = self.alpha - 0.05*self.train_no
        # self.alpha -= 0.005

        # for Question 5, decay function
        self.epsilon -= 0.05

        # for Question 6, use exponential function
        # self.epsilon = 0.7**self.train_no
        # results
        # Trial Aborted!
        # Agent did not reach the destination.

        # for Question 6, use 1/train_no/train_no
        # self.epsilon = 1/self.train_no **2
        # results
        # Trial Aborted!
        # Agent did not reach the destination.

        # for Question 6, use exponential function
        # self.epsilon = math.e**(-0.01 * self.train_no)
        # self.alpha = math.e**(-0.001 * (self.train_no))did not use
        # results
        # Trial Completed!
        # Agent reached the destination.

        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn
        # about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (inputs['light'], inputs['oncoming'], inputs['left'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        # q = [self.Q.get((state,action), 0.0) for action in self.valid_actions]
        q = [self.Q[state][action] for action in self.Q[state]]
        if q:
            maxQ = max(q)
        else:
            maxQ = 0.0
        #maxAction = max(self.Q[state], key=self.Q[state].get)

        #return self.Q[state][maxAction]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {}
                for action in self.valid_actions:
                    self.Q[state][action] = 0.0

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                max_actions = []
                for action in self.Q[state]:
                    if self.Q[state][action] == maxQ:
                        max_actions.append(action)
                action = random.choice(max_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] * (
                1 - self.alpha) + reward * self.alpha
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Example #38
File: agent.py  Project: jordan258/mlnd
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed

        # Counter object for keeping track of trial No
        self.count = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Update counter to keep track of trial No
        self.count += 1

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        # Update alpha and epsilon parameters
        if testing == True:
            self.epsilon = 0
            self.alpha = 0
        else:
            # Update epsilon using decay function
            self.epsilon = 1 / (1 + math.exp(0.05 * (self.count - 150)))

            # Default agent epsilon function
            #self.epsilon = self.epsilon - 0.05

        # Update additional class parameters as needed

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs)

        return state

    def get_state_key(self, state):
        """Unpacks state variable into a format appropriate for a dictionary key"""

        waypoint, inputs = state
        light = inputs['light']
        left = inputs['left']
        # excludes inputs['right']
        oncoming = inputs['oncoming']
        # Construct key
        state_key = (waypoint, light, left, oncoming)

        return state_key

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        state_key = self.get_state_key(state)

        # Calculate the maximum Q-value of all actions for a given state.
        # Ties between equally-valued actions are broken at random as the
        # state's action dictionary is scanned below.

        maxQ = None
        maxQ_action = None

        for pair in self.Q[state_key].items():
            action = pair[0]
            Qvalue = pair[1]
            if maxQ == None:
                maxQ = Qvalue
                maxQ_action = action
            else:
                if maxQ == Qvalue:
                    maxQ, maxQ_action = random.choice([(maxQ, maxQ_action),
                                                       (Qvalue, action)])
                elif Qvalue > maxQ:
                    maxQ = Qvalue
                    maxQ_action = action
                else:
                    continue

        return maxQ_action, maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. 
        	Updates Q-table with new state"""

        if self.learning == True:
            state_key = self.get_state_key(state)

            if state_key not in self.Q.keys():
                self.Q[state_key] = {
                    action: 0.0
                    for action in self.valid_actions
                }

        return None

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        # When not learning, choose a random action
        if self.learning == False:
            action = random.choice(self.valid_actions)
        # When learning, choose a random action with 'epsilon' probability
        if self.learning == True:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            # Otherwise, choose an action with the highest Q-value for the current state
            else:
                action, maxQ = self.get_maxQ(
                    state)  # See get_maxQ() for explanation of ties

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        state_key = self.get_state_key(state)

        # When learning, implement the value iteration update rule
        if self.learning == True:
            self.Q[state_key][action] = self.Q[state_key][
                action] + self.alpha * (reward - self.Q[state_key][action])

        return None

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
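
Example #38 decays epsilon with a logistic curve centred on trial 150 (see reset() above). A quick standalone check of that schedule, assuming the same constants 0.05 and 150, shows epsilon stays near 1.0 for early trials, crosses 0.5 at trial 150, and then falls toward 0.

import math

def logistic_epsilon(trial, steepness=0.05, midpoint=150):
    # Same form as in reset(): 1 / (1 + exp(0.05 * (trial - 150)))
    return 1.0 / (1.0 + math.exp(steepness * (trial - midpoint)))

for t in (1, 50, 100, 150, 200, 300):
    print("trial %3d -> epsilon %.3f" % (t, logistic_epsilon(t)))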
Example #39
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """ 

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        random.seed(42)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # Decay epsilon with the square of the trial count
            self.t += 1.0
            self.epsilon = 1.0 / (self.t ** 2)
            

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint 
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        # helper to create state string
        state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming'])

        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        # preset an initialization value that should be replaced by a more valid Q value in the loop.
        maxQ = -1000.0
        for action in self.Q[state]:
            if maxQ < self.Q[state][action]:
                maxQ = self.Q[state][action]
        return maxQ


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            self.Q[state] = self.Q.get(state, {None:0.0, 'forward':0.0, 'left':0.0, 'right':0.0})

        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > 0.01 and self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                valid_actions = []
                maxQ = self.get_maxQ(state)
                for act in self.Q[state]:
                    if maxQ == self.Q[state][act]:
                        valid_actions.append(act)
                action = random.choice(valid_actions)
        return action


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] + self.alpha*(reward-self.Q[state][action])

        return


    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
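
Since the update rule used throughout these examples has no discount term, each Q[state][action] is simply an exponentially weighted running average of the rewards observed for that state-action pair. A tiny standalone illustration with hypothetical rewards and alpha = 0.5:

alpha = 0.5
Q = 0.0
for reward in (2.0, 2.0, -0.5, 2.0, 2.0):
    Q = Q + alpha * (reward - Q)  # same rule as learn() above
    print(round(Q, 4))
# Prints 1.0, 1.5, 0.5, 1.25, 1.625: Q tracks the recent average reward
# rather than a discounted return.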
Example #40
File: agent.py  Project: ssuhaila/wip
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """ 

    def __init__(self, env, learning=True, epsilon=0.5, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        
        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        
        
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            #self.epsilon = self.epsilon - 0.05
            #self.t=self.t+1
            #self.epsilon = 1.0/(self.t**2)
            #self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            #self.epsilon = math.exp(-0.5 * self.t)
            
            self.epsilon = self.epsilon * 0.955

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint 
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        
        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed 
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.
        
        # Set 'state' as a tuple of relevant data for the agent        
        state = (waypoint, inputs['light'], inputs['oncoming'])
        
        print 'build_state', state
        return state


    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        print 'Access get_maxQ'
        
        maxQ = float('-inf')  # start below any possible Q-value so negative values are handled
        '''
        for action in self.Q[state]:
            if maxQ < self.Q[state][action]:
                maxQ = self.Q[state][action]'''
        
        #maxQ = max(self.Q[state].iterkeys(), key=(lambda k:self.Q[state][k])) 
        for key,value in self.Q[state].iteritems():
            maxQ = max(maxQ, value)
            
        print 'maxQ value', maxQ
        return maxQ 


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
       
        
        if state not in self.Q:
            self.Q[state] = {None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0}
        
        #print 'selfQ list', self.Q
        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        
        print 'self_epsilon in choose_action fx', self.epsilon
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon >= random.random():
                action = random.choice(self.valid_actions)
            else:
                action = self.Q[state].keys()[(self.Q[state].values()).index(self.get_maxQ(state))]
                
                #update the valid_actions with maxQ state
                #valid_actions = [] 
                #maxQ = self.get_maxQ(state)
                #for act in self.Q[state]:
                #    if maxQ == self.Q[state][act]:
                #        valid_actions.append(act)
                #    action = random.choice(valid_actions)
        
                    
        print 'the chosen action:', action
        return action


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        
        #print 'selfQ in learn function', self.Q
        return


    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
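
Example #40 decays epsilon multiplicatively (epsilon *= 0.955 per trial) starting from 0.5. A quick calculation of how many training trials that schedule allows, assuming training ends once epsilon falls below a 0.05 tolerance (the threshold these examples appear to target):

import math

start, factor, threshold = 0.5, 0.955, 0.05
trials = int(math.ceil(math.log(threshold / start) / math.log(factor)))
print(trials)  # about 50 training trials before epsilon drops below the threshold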
Example #41
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        valid_actions = [None, 'forward', 'left', 'right']
        TL_valid_states = [True, False]

        state_desc = {
            'light': TL_valid_states,
            'oncoming': valid_actions,
            'left': valid_actions,
            'right': valid_actions,
            'next_waypoint': ['forward', 'left', 'right']
        }

        self.Q_prev = dict()
        self.gamma = 0.1
        self.alpha = 0.2
        self.p_threshold = 1

        # self.state = TrafficLight

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        valid_actions = [None, 'forward', 'left', 'right']
        # TODO: Update state
        self.state = (inputs['light'], self.next_waypoint)

        # TODO: Select action according to your policy

        Q_actions = []

        for action_i in valid_actions:
            str_state_action = [str(s) for s in self.state]
            str_state_action.append(str(action_i))
            str_state_action = ",".join(str_state_action)
            if len(self.Q_prev) == 0:
                self.Q_prev[str_state_action] = 0
                Q_actions.append(0)
            else:
                if str_state_action in self.Q_prev.keys():
                    Q_actions.append(self.Q_prev[str_state_action])
                else:
                    self.Q_prev[str_state_action] = 0
                    Q_actions.append(0)

        Q_max = max(Q_actions)
        action_max_inds = [
            valid_actions[i] for i in range(len(valid_actions))
            if Q_max == Q_actions[i]
        ]
        action = random.choice(action_max_inds)

        str_state_action_now = [str(s) for s in self.state]
        str_state_action_now.append(str(action))
        str_state_action_now = ",".join(str_state_action_now)

        #action_random = [None, 'forward', 'left', 'right']
        #rand_action = action_random[random.randint(0,3)]
        #action = rand_action

        #print (self.state,action)

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.Q_prev[str_state_action_now] = (1 - self.alpha) * self.Q_prev[
            str_state_action_now] + self.alpha * (reward + self.gamma * Q_max)
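
Example #41 keys its Q-table on comma-joined state/action strings and initialises missing entries with a nested if/else. dict.setdefault expresses the same "create if missing, then read" step in one call; a minimal sketch with illustrative names only:

Q_prev = {}

def q_lookup(state, action):
    key = ",".join(str(part) for part in state) + "," + str(action)
    # setdefault inserts a 0 entry the first time a key is seen, then returns it
    return key, Q_prev.setdefault(key, 0)

key, value = q_lookup(('green', 'forward'), 'forward')
print("%s -> %s" % (key, value))  # green,forward,forward -> 0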
Example #42
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.9):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        self.counter = 1

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.alpha = 0
            self.epsilon = 0
        else:
            self.epsilon = 0.999**self.counter
            self.counter += 1
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        #this reflects the final optimized model
        state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming'])
        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxAction = max(self.Q[state], key=lambda x: self.Q[state][x])
        maxQ = self.Q[state][maxAction]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if state not in self.Q:
            self.Q[state] = {'left': 0, 'right': 0, 'forward': 0, None: 0}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.old_state = self.state
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = random.choice(self.valid_actions)

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if self.learning and random.random() > self.epsilon:
            maxVal = self.get_maxQ(state)

            #choose randomly if there are more than one max values
            pos_actions = [
                act for act in self.Q[state] if self.Q[state][act] == maxVal
            ]
            action = random.choice(pos_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        old_value = self.Q[state][action]
        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * old_value + self.alpha * (reward)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Example #43
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self,
                 env,
                 learning=False,
                 epsilon=1.0,
                 alpha=0.5,
                 exploration=1):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        self.exploration = exploration  # Exploration factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0
        random.seed(666)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            #self.epsilon = self.epsilon - 0.05
            self.t += 1.0
            if self.exploration == 1:
                self.epsilon = 1.0 / (self.t**2)
            elif self.exploration == 2:
                self.epsilon = 1.0 / (self.t**2 + self.alpha * self.t)
            elif self.exploration == 3:
                self.epsilon = 1.0 / (self.t**2 - self.alpha * self.t)
            elif self.exploration == 4:
                self.epsilon = math.fabs(math.cos(self.alpha * self.t))
            elif self.exploration == 5:
                self.epsilon = math.fabs(math.cos(
                    self.alpha * self.t)) / (self.t**2)

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent

        # Create state string
        def state_str(state):
            if state is None:
                return 'None'
            else:
                return str(state)

        state = state_str(waypoint) + "_" + inputs['light'] + "_" + state_str(
            inputs['left']) + "_" + state_str(inputs['oncoming'])
        self.createQ(state)  # Create 'state' in Q-table

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if (self.learning) and (state not in self.Q):
            self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            possible_actions = self.Q[state]
            # Explore with probability epsilon; otherwise exploit, breaking ties at random
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                best_actions = [
                    action for action in possible_actions
                    if self.Q[state][action] == maxQ
                ]
                action = random.choice(best_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] + self.alpha * (
                reward - self.Q[state][action])

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
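
Example #43 parameterises five candidate epsilon-decay schedules through its exploration argument. Evaluating them side by side at a few trial counts (assuming alpha = 0.5, as in the constructor) makes the trade-offs visible; this is only a comparison harness, not part of the agent:

import math

alpha = 0.5
schedules = {
    1: lambda t: 1.0 / (t ** 2),
    2: lambda t: 1.0 / (t ** 2 + alpha * t),
    3: lambda t: 1.0 / (t ** 2 - alpha * t),
    4: lambda t: math.fabs(math.cos(alpha * t)),
    5: lambda t: math.fabs(math.cos(alpha * t)) / (t ** 2),
}
for t in (1.0, 2.0, 5.0, 10.0, 20.0):
    row = "  ".join("%d: %.4f" % (k, f(t)) for k, f in sorted(schedules.items()))
    print("t = %4.1f  %s" % (t, row))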
Example #44
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

        #Set traffic_light and movement
        traffic_light=["red","green"]
        motion = [None, 'forward', 'left', 'right']
        waypoint,oncoming,left,right=motion,motion,motion,motion


        #Initialize q_table(States are traffic_light,waypoint,oncoming,left and right)
        #Set all features to 0
        self.q_table = {}
        for light in traffic_light:
            for point in waypoint:
                for on in oncoming:
                    for lf in left:
                        for ri in right:
                            self.q_table[(light, point, on, lf,ri)] = {None: 0, 'forward': 0, 'left': 0, 'right': 0}
       # print self.q_table


        self.episode=0
        self.preserve=[]
        self.failure=0

    def reset(self,destination=None,total=0):

        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.total_reward=total


    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)


        old_time=t
        print 'Old_TIME',old_time

        # TODO: Update state
        self.state = (inputs['light'],
                      self.next_waypoint,
                      inputs['oncoming'],
                      inputs['left'],
                      inputs['right'])

        print "The current state is: {}".format(self.state)

        print "t:{}".format(t)

        # TODO: Select action according to your policy
        epsilon=0.1
        #epsilon=1.0
        rand=random.random()
        if rand<epsilon:
            action=random.choice(Environment.valid_actions[0:])
            print "RANDOM ACTION"
        else:
            if max(self.q_table[self.state].values())==0:
                action=random.choice(Environment.valid_actions[0:])
                print self.q_table[self.state]

            else:
                action = max(self.q_table[self.state],
                     key=self.q_table[self.state].get)

            print action

        # Execute action and get reward
        reward = self.env.act(self, action)

        print "REWARD IS:",reward
        self.total_reward+=reward

        print "Total Reward",self.total_reward
Example #45
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial = 1
        self.epsilonDecayVariable = epsilon
        self.alphaDecayVariable = alpha

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        #self.epsilon =self.epsilon -.05
        # epsilon = e^(-a*t), for 0 < a < 1
        #self.epsilon =math.exp(-self.epsilonDecayVariable*self.trial*1)
        # epsilon = cos(a*t), for 0 < a < 1
        #self.epsilon =math.cos(self.epsilonDecayVariable*self.trial)
        trialNum = self.trial
        ParmA = -5E-11
        ParmB = 0.104
        ParmC = 210

        self.epsilon = 1 * math.exp(
            -ParmA * trialNum) / (1 + math.exp(ParmB * (trialNum - ParmC)))
        #self.epsilon=self.epsilon -.05
        #self.epsilon=math.exp(-0.0005*self.trial)*(4+math.cos(.7*self.trial))/5
        #=EXP   (-lambda*A1)*(4+COS(freq*A1))/5

        #decay learning rate
        #self.alpha =.75 #math.exp(-self.alphaDecayVariable*self.trial)
        #self.epsilon =math.pow(self.epsilonDecayVariable,self.trial )

        # Update additional class parameters as needed
        #not sure on additional class parms  -time remaining, success rate,etc.. ?

        self.trial = self.trial + 1
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        state = (inputs['light'], inputs['oncoming'], inputs['right'],
                 inputs['left'], waypoint)

        if self.learning:
            if state not in self.Q:
                # state not seen before; initialise each action's Q-value to 0
                self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0}

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########

        #get the highest Q value from the dict of actions for this state: state parm.
        maxQ = max(self.Q[state].values())
        #maxQ = None

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        #action = random.choice(self.valid_actions)

        #print(action)
        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        #get max q value
        maxQ = self.get_maxQ(state)

        #initially choose action with highest Q Value
        dictActions = self.Q[state]
        action = dictActions.keys()[dictActions.values().index(maxQ)]

        if self.learning:
            #if the random choice is within epsilon prob, then choose randomly.
            lDblRand = random.random()
            #is the random less than epsilon ?
            if lDblRand < self.epsilon:
                action = random.choice(self.valid_actions)
        else:
            print('not learning')

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + (reward * self.alpha)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
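
Example #45's decay in reset() multiplies an exponential term by a logistic one, but with ParmA = -5E-11 the exponential factor stays within about 1e-8 of 1.0 over a few hundred trials, so the schedule is effectively a logistic decay centred on trial 210. A quick standalone check of that claim, using the same constants:

import math

ParmA, ParmB, ParmC = -5e-11, 0.104, 210

def epsilon_at(trial):
    return math.exp(-ParmA * trial) / (1 + math.exp(ParmB * (trial - ParmC)))

for t in (1, 100, 210, 300):
    print("trial %3d  exp factor %.10f  epsilon %.4f"
          % (t, math.exp(-ParmA * t), epsilon_at(t)))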
Example #46
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self,
                 env,
                 learning=False,
                 epsilon=1.0,
                 alpha=0.5,
                 epsilon_decay=0.05):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.epsilon_decay = epsilon_decay
        self.step = 0
        self.global_step = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.step = 0
        self.nsteps = self.env.get_deadline(self)
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            #self.epsilon -= self.epsilon_decay
            self.epsilon = math.cos(self.global_step * self.epsilon_decay)
            if self.epsilon < 0.0: self.epsilon = 0.0

            #self.alpha = 0.98 * self.alpha
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0
        state = tuple({
            'waypoint': waypoint,
            'light': inputs['light'],
            'left': 'forward' if inputs['left'] == 'forward' else None,
            'oncoming': inputs['oncoming']
        }.items())

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQs = list()
        if state in self.Q.keys():
            max_value = float('-inf')
            for action in self.Q[state].keys():
                value = self.Q[state][action]
                #print("Action found for state: %s - value: %f" % (str(action), value))
                if value > max_value:
                    max_value = value
                    maxQs = [action]
                elif value == max_value:
                    maxQs.append(action)

        #print("maxQs: %s" % str(maxQs))
        if (len(maxQs) == 0): return None
        return maxQs[0] if (len(maxQs) == 1) else random.choice(maxQs)

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning and state not in self.Q.keys():
            self.Q[state] = {
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0,
                None: 0.0
            }

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        self.step += 1
        self.global_step += 1
        #   Otherwise, choose an action with the highest Q-value for the current state
        if self.learning == True:
            choice = np.random.choice((0, 1),
                                      p=(self.epsilon, 1 - self.epsilon))
            if choice == 0:
                print(
                    "### Explore (step %d, nsteps %d, epsilon %f, epsilon*nsteps %f)"
                    % (self.step, self.nsteps, self.epsilon,
                       self.epsilon * self.nsteps))
                action = random.choice(self.valid_actions)
            else:
                print(
                    "@@@ Exploit (step %d, nsteps %d, epsilon %f, epsilon*nsteps %f)"
                    % (self.step, self.nsteps, self.epsilon,
                       self.epsilon * self.nsteps))
                action = self.get_maxQ(state)
        else:
            action = random.choice(self.valid_actions)

        print("self.learning: %s" % str(self.learning))
        print("Q-learning database size: %d" % len(self.Q))
        print("self.valid_actions: %s" % str(self.valid_actions))
        print("action: %s" % str(action))
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        if self.learning == False:
            return

        if state in self.Q.keys():
            old_value = self.Q[state][action]
            self.Q[state][action] = old_value + self.alpha * (reward -
                                                              old_value)
            #print("Action %s got updated to %f\n" % (action, self.Q[state][action]))

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
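
Example #46 delegates the explore/exploit coin-flip to NumPy's probability-weighted choice. The same decision can be reproduced in isolation; a minimal sketch assuming numpy is available (the helper name should_explore is illustrative):

import numpy as np

def should_explore(epsilon):
    # True with probability epsilon, mirroring the p=(epsilon, 1 - epsilon) draw above
    return np.random.choice((0, 1), p=(epsilon, 1.0 - epsilon)) == 0

explored = sum(should_explore(0.3) for _ in range(10000))
print("explored %d of 10000 steps (expect roughly 3000)" % explored)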
Example #47
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """ 

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning # Whether the agent is expected to learn
        self.Q = dict()          # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        #self.trial = 0


    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        
        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        global trial
        trial = trial + 1
        
        if testing == False:
            self.epsilon -= 0.05
            #self.epsilon = math.pow(0.97, trial)
        elif testing == True:
            self.epsilon = 0
            self.alpha = 0
            print self.epsilon
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint() # The next waypoint 
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent        
        #state = (waypoint, inputs)
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])

        return state


    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ 


    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {None : 0.0, 'left': 0.0, 'right' : 0.0, 'forward' : 0.0}
        
        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        #action = 'forward'
        #action = None
        action_list = [None, 'left', 'right', 'forward']
        #action = random.choice(action_list)
        
        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(action_list)
        else:
            if random.random() <= self.epsilon:
                action = random.choice(action_list)
            else:
                self.createQ(self.state)
                action = self.get_action(self.Q[self.state], self.get_maxQ(self.state))
                print action  # [debug]
        return action
    
    
    def get_action(self, dictionary, value):
        """ Return the first action in 'dictionary' whose Q-value equals 'value'. """
        for k, v in dictionary.iteritems():
            if v == value:
                return k


    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        self.createQ(self.state)
        # Blend the old Q-value with the new reward using the learning rate alpha
        self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        return


    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()          # Get current state
        self.createQ(state)                 # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action) # Receive a reward
        self.learn(state, action, reward)   # Q-learn

        return
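
A note on the example above: get_action returns the first action whose Q-value matches the maximum, so ties always resolve in dictionary order. Below is a minimal sketch (a hypothetical helper, not part of the submission above, assuming the same dict-of-dicts Q-table layout) of breaking ties at random instead:

import random

def argmax_random_tie(q_for_state):
    """Pick uniformly at random among the actions tied for the highest Q-value."""
    max_q = max(q_for_state.values())
    best_actions = [a for a, q in q_for_state.items() if q == max_q]
    return random.choice(best_actions)

# Usage sketch: action = argmax_random_tie(self.Q[self.state])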
Example #48
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.t += 1
            # self.epsilon-=0.05
            # self.epsilon=(self.alpha**self.t)
            # self.epsilon=1.0/(self.t**2)
            # self.epsilon=math.exp(-(self.alpha * self.t))
            self.epsilon = math.fabs(math.cos(self.alpha * self.t))
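            # Note: |cos(alpha * t)| is not a monotonic decay; it oscillates between 0 and 1,
            # so exploration periodically jumps back up even in late training trials.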
            #self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            # self.epsilon -= self.epsilon / 100
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        # state = (waypoint,
        #          inputs['left'],
        #          inputs['light'],
        #          inputs['oncoming'])
        # stringify = lambda s: 'None' if s is None else str(s)
        # state = [stringify(s) for s in state]
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        # maxQ = max(self.Q[state].values())
        #
        # return maxQ
        if state in self.Q:
            # Get all the actions available in the current state
            state_actions = self.Q[state]

            # Get the Q-values of those actions
            state_actions_val = state_actions.values()

            # Take the maximum Q-value for the current state
            maxQ = max(state_actions_val)
            print('MaxQ', maxQ)
        else:
            # Unseen state: fall back to the default initial Q-value
            maxQ = 0.0
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        # Set Default value of action
        action_initial_val = 0.0

        # If learning is true
        if self.learning is True:
            if state not in self.Q:
                # self.Q = dict()
                # Adding all the validate action at current state
                self.Q[state] = {
                    action: action_initial_val
                    for action in self.valid_actions
                }

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > random.random():
                print('Learning True')
                action = random.choice(self.valid_actions)
            else:
                # print(state)
                # Max Q-value at current state
                maxQ_Val = self.get_maxQ(state)
                # print('maxQ_val', maxQ_Val)

                # Actions list at current state
                action_at_state = self.Q[state]

                # print('action_at_state', action_at_state)
                max_Q_actions = [
                    val[0] for val in action_at_state.items()
                    if val[1] == maxQ_Val
                ]

                # print('max_Q_actions', max_Q_actions)
                action = random.choice(max_Q_actions)
                # print('max_Q_actions  action', action)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning is True:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + (self.alpha * reward)
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn
        return
Example #49
File: agent.py Project: sjbaek/MLND_P4
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.serial = 0

        # initialize Q-matrix
        self.Qmat = {}

        # initialize other variables
        # Basic Q-learning
        #self.alpha = 0.8
        #self.gamma = 0.5
        #self.epsilon = 0.0

        # Enhanced Q-learning

        #self.alpha = 0.8
        self.alpha = 0.4
        #self.gamma = 1.0
        #self.gamma = 0.5
        self.gamma = 0.2

        self.epsilon = 1.0
        #self.success = 0.

        # This function reaches at the very beginning of the script

        # Open a csv file to keep track of the agent's success rate
        with open("SimulationResults_Enhanced.csv", "a") as outputFile:
            #with open("SimulationResults.csv","a") as outputFile:
            outputFile.write("\n{},{},{},{},".format("Gamma", self.gamma,
                                                     "Alpha", self.alpha))

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # This function reaches at every start of a new trial

        # TODO: Prepare for a new trip; reset any variables here, if required
        #self.serial = 0
        self.serial += 1

    def update(self, t):
        # ----------------------------- check information from previous time step

        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        #printOK = True
        printOK = False

        self.state = (inputs['light'], self.next_waypoint)

        if printOK: print "1. current state is:", self.state

        # TODO: Select action according to your policy

        epsilon = (self.epsilon / (self.epsilon + self.serial))
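        # With self.epsilon fixed at 1.0, this works out to 1 / (1 + trial number),
        # so the exploration probability shrinks as more trials are completed.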

        if self.state in self.Qmat.keys():
            if printOK:
                print "   1.a ----------------------------------{} exists".format(
                    self.state)
            action_key_value = self.Qmat[
                self.state]  # this shows key/value of selected state in Qmat

            if printOK:
                print "        Q table for this state: ", action_key_value

            # when multiple maxima exist - random selection
            actions_max = {actions:action_value for actions, action_value \
                        in action_key_value.items() if action_value == max(action_key_value.values())}

            action = random.choice(actions_max.keys())
            if printOK: print "      available actions={}".format(actions_max)
            if printOK: print "         learned_action={}".format(action)

            # "exploring" takes place randomly
            if random.random() < epsilon:
                action = random.choice([None, 'forward', 'left', 'right'])
                if printOK:
                    print "         learned_action={} (random)".format(action)
        else:
            if printOK:
                print "   1.b ----------------------------------{} NEW state".format(
                    self.state)
            self.Qmat[self.state] = {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            }

            # random action, when landed on a 'new' state
            action = random.choice([None, 'forward', 'left', 'right'])
            if printOK: print "        random action={}".format(action)

        # Execute action and get reward
        reward = self.env.act(self, action)
        if printOK: print "   Reward for this action={}".format(reward)

        # TODO: Learn policy based on state, action, reward

        # New state definition seems incorrect
        # FIX: New definition of next state: state and waypoint after action was determined above!
        inputs_new = self.env.sense(self)

        # This is the key; next_waypoint should be called twice: once above and once here!
        self.next_waypoint = self.planner.next_waypoint()
        new_state = (inputs_new['light'], self.next_waypoint)
        if printOK: print "2. next state is:", new_state
        if new_state in self.Qmat.keys():
            Q_next = max(self.Qmat[new_state].values())
            if printOK:
                print "   2.a -----------------------------Q table for NEXT state: ", self.Qmat[
                    new_state]
        else:
            if printOK:
                print "   2.a -----------------------------Q table for NEXT state: NOT FOUND"
            Q_next = 0.

        if printOK:
            print "   2.b -----------------------------Q[new state]=", Q_next

        Qhat = (1.0 - self.alpha) * self.Qmat[
            self.state][action] + self.alpha * (reward + self.gamma * Q_next)
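        # Standard Q-learning update: Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a'))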

        self.Qmat[self.state][action] = Qhat
        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

        if printOK: print "3. CONCLUSION"
        if printOK:
            print "   action is {}, state is {}".format(action, self.state)
        if printOK:
            print "   Q table for (s,a) is: {}".format(self.Qmat[self.state])
Example #50
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=True, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        def decay_exponential(a):
            self.epsilon = a**self.t

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing == True:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            self.t += 1
            decay_exponential(0.998)

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'])

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            if state not in self.Q:
                self.Q[state] = {
                    None: 0.0,
                    'forward': 0.0,
                    'left': 0.0,
                    'right': 0.0
                }  # Init default Q Score.
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                max_Q = max(self.Q[state].values())
                max_candidate = [
                    x for x in self.Q[state] if self.Q[state][x] == max_Q
                ]
                action = random.choice(max_candidate)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            try:
                self.Q[state][action] = (1 - self.alpha) * (
                    self.Q[state][action]) + self.alpha * reward
            except KeyError:
                pass

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
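
For context on the 0.998 exponential decay used in reset() above: a quick back-of-the-envelope check (plain Python, not part of the submission) of how many training trials it takes for epsilon = 0.998**t to drop below a tolerance such as 0.01:

import math

a, tol = 0.998, 0.01
# epsilon = a**t falls below tol once t > ln(tol) / ln(a)
trials_needed = math.log(tol) / math.log(a)
print trials_needed  # roughly 2300 trials before epsilon < 0.01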
Example #51
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.75):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions
        self.action_list = ['None', 'left', 'right', 'forward']

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.n_trial = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.n_trial += 1
        #self.alpha = 1.0 / self.n_trial**2
        #self.epsilon = 1.0/ (self.n_trial**2)
        self.epsilon = 1.0 / self.n_trial
        #self.epsilon = 1.0/ ((self.n_trial+1.0)/2.0)
        #self.epsilon = 1.0 - (self.n_trial**1.5/50.0)
        #self.epsilon -= .05
        if testing == True:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        state = (inputs['oncoming'], inputs['left'] == 'forward',
                 inputs['light'], waypoint)
        if self.learning == True:
            if str(state) not in self.Q.keys():
                self.Q[str(state)] = dict()
                for action in self.action_list:
                    self.Q[str(state)].update({action: 0.0})
                print(self.Q)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        # I found this code on stack overflow to get the max key
        maxQ = self.Q[str(state)][str(
            max(self.Q[str(state)], key=lambda key: self.Q[str(state)][key]))]

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if str(state) not in self.Q:
                self.Q[str(state)] = dict()
                for action in self.action_list:
                    self.Q[str(state)].update({action: 0.0})

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        #random action
        action_list = []
        if self.learning == False:
            action = self.valid_actions[random.randint(0, 3)]
        elif self.epsilon > random.random() and self.epsilon > 0:
            action = self.valid_actions[random.randint(0, 3)]
        else:
            for i in self.Q[str(state)]:
                if self.Q[str(state)][i] == self.get_maxQ(state):
                    action_list.append(str(i))
            if len(action_list) > 1:
                action = action_list[random.randint(0, len(action_list) - 1)]
            else:
                action = action_list[0]
            if action == 'None':
                action = None

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning == True:
            currentQ = self.Q[str(state)][str(action)]
            self.Q[str(state)][str(
                action)] = reward * self.alpha + currentQ * (1 - self.alpha)
            #self.alpha = self.alpha**1.5

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Example #52
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env, **kwargs):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        add_total = False
        self.success = 0
        self.total = 0
        self.counter = 0
        self.epsilon_reset_counter = 0
        self.trial_counter = 0.0
        self.min_epsilon = 0.001
        self.eps_freq = 1.0
        self.filled_cell_count = 0
        self.total_cell_count = 0
        self.updated_func_counter = 0
        global stats_df_counter
        global stats_df

        for key, value in kwargs.iteritems():
            print "%s = %s" % (key, value)
            if key == 'alp':
                self.alpha = value
            elif key == 'gma':
                self.gamma = value
            elif key == 'eps':
                self.epsl = value
        self.epsilon = self.epsl
        print "epsilon: ", self.epsilon
        self.qt = QTable(self.alpha, self.gamma)
        print '-' * 80

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        totalTime = self.env.get_deadline(self)
        self.qt.printVal(totalTime)
        self.trial_counter += 1.0
        if self.epsilon > self.min_epsilon:
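            # Decays roughly as (5 * initial epsilon) / trial number; eps_freq = ceil(1 / epsilon)
            # is then used by update() below to decide how often a random (exploring) action is taken.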
            self.epsilon = (5.0 * self.epsl) / self.trial_counter
            self.eps_freq = math.ceil(1.0 / self.epsilon)
            print "self.epsilon:", self.epsilon, ", self.eps_freq: ", self.eps_freq, "\n"

    def update(self, t):
        global stats_df
        global stats_df_counter
        self.counter += 1
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        current_state = self.env.sense(self)
        self.state = current_state

        deadline = self.env.get_deadline(self)
        # TODO: Update state

        # TODO: Select action according to your policy

        #action = random.choice([None, 'forward', 'left', 'right'])
        #if self.total > 0 and self.total % self.epsilon_freq == 0.0:
        #    print "simulated annealing at ", self.total
        #    action = random.choice([None, 'forward', 'left', 'right'])
        #else:
        if self.epsilon > self.min_epsilon and deadline != 0 and deadline != self.eps_freq and math.floor(
                deadline % self.eps_freq) == 0.0:
            #self.epsilon_reset_counter += 1
            action = random.choice([None, 'forward', 'left', 'right'])
            print "annealing now.", "self.epsilon:", self.epsilon, ", action: ", action, ", deadline:", deadline

        else:
            #print "self.counter: ", self.counter, ", multiplier:", (self.counter * self.epsilon)
            action = self.qt.get_next_action(self.next_waypoint, deadline,
                                             current_state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        add_total = False
        if deadline == 0:
            add_total = True
        if reward > 10:
            self.success += 1
            add_total = True
        if add_total:
            self.total += 1
            print("success: {} / {}".format(self.success, self.total))

        if self.total == 100:

            for item, frame in self.qt.qtable.iteritems():
                for item2, frame2 in frame.iteritems():
                    for item3, frame3 in frame2.iteritems():
                        for item4, frame4 in frame3.iteritems():
                            self.total_cell_count += 1
                            #print("f4:", frame4)
                            if frame4 != 0.0:
                                #print "\n"
                                self.printNav(item2)
                                self.printTraffic(item3)
                                self.printTrafficLight(item4)
                                self.printAction(item)
                                print "Q-Val: {0:.5f}".format(frame4)
                                self.filled_cell_count += 1
            print '-' * 80
            print "updated cells: ", self.filled_cell_count, ", self.total_cell_count:", self.total_cell_count, ", updated_func_counter:", self.updated_func_counter
            print "self.alpha:", self.alpha, "self.gamma:", self.gamma, ", self.epsilon:", self.epsl, ", success:", self.success, " in steps: ", deadline
            stats_df.loc[stats_df_counter] = [
                self.alpha, self.gamma, self.epsl, self.success, deadline
            ]
            stats_df_counter += 1
            print '_' * 80
            #    print '_'*20
        # TODO: Learn policy based on state, action, reward
        next_state_value = self.env.sense(self)
        next_state_deadline = self.env.get_deadline(self)
        next_state_waypoint = self.planner.next_waypoint()
        self.qt.update(self.next_waypoint, deadline, current_state, action,
                       reward, next_state_value, next_state_waypoint, self,
                       self.env)
        self.updated_func_counter += 1

    def printAction(self, code):
        print '|',
        if code == 'AN':
            print "Action: None",
        elif code == 'BF':
            print "Action: Forward",
        elif code == 'CR':
            print "Action: Right",
        elif code == 'DL':
            print "Action: Left",
        print '|',

    def printNav(self, code):
        print '|',
        if code == 0:
            print "Nav: None",
        elif code == 1:
            print "Nav: Forward",
        elif code == 2:
            print "Nav: Right",
        elif code == 3:
            print "Nav: Left",

    def printTraffic(self, code):
        left_mask = 0b000011
        right_mask = 0b001100
        oncoming_mask = 0b110000
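        # The traffic code appears to pack two bits per direction:
        # bits 0-1 = left, bits 2-3 = right, bits 4-5 = oncoming
        # (within each pair: 0 = None, 1 = Forward, 2 = Right, 3 = Left).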

        left_filtered = code & left_mask
        right_filtered = code & right_mask
        oncoming_filtered = code & oncoming_mask

        print '| Traffic state: ',
        if left_filtered == 0:
            print "Left: None",
        elif left_filtered == 1:
            print "Left: Forward",
        elif left_filtered == 2:
            print "Left: Right",
        elif left_filtered == 3:
            print "Left: Left",
        print '-+-',

        if right_filtered == 0:
            print "Right: None",
        elif right_filtered == 4:
            print "Right: Forward",
        elif right_filtered == 8:
            print "Right: Right",
        elif right_filtered == 12:
            print "Right: Left",
        print '-+-',

        if oncoming_filtered == 0:
            print "Oncoming: None",
        elif oncoming_filtered == 16:
            print "Oncoming: Forward",
        elif oncoming_filtered == 32:
            print "Oncoming: Right",
        elif oncoming_filtered == 48:
            print "Oncoming: Left",

    def printTrafficLight(self, code):
        print '| ',
        if code == 0:
            print "Light: Red",
        else:
            print "Light: Green",
Example #53
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world using Q learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'yellow'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        ##initialize q table here
        self.Q = dict()
        self.alpha    = 1.0
        self.epsilon  = 0.0 
        self.gamma    = 0.05
        self.discount = self.gamma
        self.state = None

    def flipCoin(self, p ):
        r = random.random()
        return r < p

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def getQValue(self, state, action):
        return self.Q.get((state, action), 1)  

    def getValue(self, state):
        # Return the highest Q-value over all actions in this state
        bestQValue = 0
        for action in Environment.valid_actions:
            if self.getQValue(state, action) > bestQValue:
                bestQValue = self.getQValue(state, action)

        return bestQValue

    def getPolicy(self, state):

        bestQValue = 0
        bestAction = None
        for action in Environment.valid_actions:
            if self.getQValue(state, action) > bestQValue:
                bestQValue = self.getQValue(state, action)
                bestAction = action
            elif self.getQValue(state, action) == bestQValue:
                # Break ties at random between equally good actions
                bestAction = random.choice([bestAction, action])
        return bestAction

    def getAction(self, state):
        action = None
        if (self.flipCoin(self.epsilon)):
            action = random.choice(Environment.valid_actions)
        else:
            action = self.getPolicy(state)
        return action

    def qTable(self, state, action, nextState, reward):
    
        if((state, action) not in self.Q): 
            self.Q[(state, action)] = 1
        else:
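            # Incremental Q-learning update: move Q(s,a) toward reward + discount * V(nextState)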
            self.Q[(state, action)] = self.Q[(state, action)] + self.alpha*(reward + self.discount*self.getValue(nextState) - self.Q[(state, action)]) 


    def update(self, t):
       
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        deadline = self.env.get_deadline(self)

        inputs = self.env.sense(self)
        inputs['waypoint'] = self.next_waypoint
        self.state = tuple(sorted(inputs.items()))

        action = self.getAction(self.state)

        reward = self.env.act(self, action)

        self.prevAction = action
        self.prevState = self.state
        self.prevReward = reward

        if self.prevReward is not None:
            self.qTable(self.prevState,self.prevAction,self.state,self.prevReward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)
Example #54
class LearningAgent(Agent):
    """An agent that learns how to drive in the smartcab world."""
    def __init__(self,
                 env,
                 init_value=0,
                 gamma=0.90,
                 alpha=0.20,
                 epsilon=0.10,
                 discount_deadline=False,
                 history=0):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override default color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint

        ## Initialize the Q-function as a dictionary (state) of dictionaries (actions)
        self.q_function = {}
        self.history = history
        if self.history > 0:
            self.init_q_function()

        ## Initial value of any (state, action) tuple is an arbitrary random number
        self.init_value = init_value
        ## Discount factor gamma: 0 (myopic) vs 1 (long-term optimal)
        self.gamma = gamma
        self.discount_deadline = discount_deadline

        ## Learning rate alpha: 0 (no learning) vs 1 (consider only most recent information)
        ## NOTE: Normally, alpha decreases over time: for example, alpha = 1 / t
        self.alpha = alpha
        ## Parameter of the epsilon-greedy action selection strategy
        ## NOTE: Normally, epsilon should also be decayed by the number of trials
        self.epsilon = epsilon

        ## The trial number
        self.trial = 1
        ## The cumulative reward
        self.cumulative_reward = 0

    def get_q_function(self):
        return self.q_function

    def set_params(self,
                   init_value=0,
                   gamma=0.90,
                   alpha=0.20,
                   epsilon=0.10,
                   discount_deadline=False,
                   history=0):
        self.init_value = init_value
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount_deadline = discount_deadline
        self.history = history

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.env.set_trial_number(self.trial)
        self.trial += 1
        ## Decay the epsilon parameter
        #         self.epsilon = self.epsilon / math.sqrt(self.trial)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.cumulative_reward = 0

    def init_q_function(self):
        '''
        Initializes the Q-tables with previously learned results.
        '''
        csv_files = glob.glob('../q_tables/*.csv')
        history_counter = 0
        state_counter = {}
        for csv_file in csv_files:
            q_df = pd.read_csv(csv_file, sep=',', header=None)
            ## Assign the header
            header = ['state'
                      ] + [str(action) for action in self.env.valid_actions]
            q_df.columns = header
            for i in xrange(q_df.shape[0]):
                state = q_df.ix[i]['state']
                state = state[1:-1]
                state_tuple = literal_eval(state)
                if state_tuple in self.q_function:
                    action_function = self.q_function[state_tuple]
                    for action in self.env.valid_actions:
                        current_value = action_function[action]
                        action_function[action] = current_value + q_df.ix[i][
                            str(action)]
                    self.q_function[state_tuple] = action_function
                    ## Update the frequency counter
                    counter = state_counter[state_tuple]
                    state_counter[state_tuple] = counter + 1
                else:
                    action_function = {}
                    for action in self.env.valid_actions:
                        action_function[action] = q_df.ix[i][str(action)]
                    self.q_function[state_tuple] = action_function
                    state_counter[state_tuple] = 1
            history_counter += 1
            if history_counter >= self.history:
                break

        ## Average all action values
        for state in state_counter.keys():
            count = state_counter[state]
            action_function = self.q_function[state]
            for action in self.env.valid_actions:
                current_value = action_function[action]
                action_function[action] = current_value / count
            self.q_function[state] = action_function

    def select_action(self, state=None, is_current=True, t=1):
        '''
        Implements the epsilon-greedy action selection that selects the best-valued action in this state
        (if is_current) with probability (1 - epsilon) and a random action with probability epsilon.
        't' is the current time step that can be used to modify epsilon.
        '''
        if state in self.q_function:
            ## Find the action that has the highest value
            action_function = self.q_function[state]
            q_action = max(action_function, key=action_function.get)
            if is_current:
                ## Generate a random action
                rand_action = random.choice(self.env.valid_actions)
                ## Select action using epsilon-greedy heuristic
                rand_num = random.random()
                action = q_action if rand_num <= (
                    1 - self.epsilon) else rand_action
            else:
                action = q_action
        else:
            ## Initialize <state, action> pairs and select random action
            action_function = {}
            for action in self.env.valid_actions:
                action_function[action] = self.init_value
            self.q_function[state] = action_function
            action = random.choice(self.env.valid_actions)

        return action

    def update(self, t):
        '''
        At each time step t, the agent:
        - Is given the next waypoint (relative to its current location and direction)
        - Senses the intersection state (traffic light and presence of other vehicles)
        - Gets the current deadline value (time remaining)
        '''
        ## The destination trying to reach
        #         destination = self.env.agent_states[self]['destination']

        ## Observe the current state variables
        ## (1) Traffic variables
        inputs = self.env.sense(self)
        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        ## (2) Direction variables
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator

        ## Update the current observed state
        self.state = (light, oncoming, left, self.next_waypoint)
        current_state = self.state  # save this for future use

        ## Select the current action
        action = self.select_action(state=current_state, is_current=True, t=t)
        ## Execute action, get reward and new state
        reward = self.env.act(self, action)
        self.cumulative_reward += reward
        self.env.set_cumulative_reward(self.cumulative_reward)

        ## Retrieve the current Q-value
        current_q = self.q_function[self.state][action]
        #         print 'Current Q = ' + str(current_q)

        ## Update the state variables after action
        ## (1) Traffic variables
        new_inputs = self.env.sense(self)
        light = new_inputs['light']
        oncoming = new_inputs['oncoming']
        left = new_inputs['left']
        ## (2) Direction variables
        self.next_waypoint = self.planner.next_waypoint()

        deadline = self.env.get_deadline(self)
        if self.discount_deadline:
            if t == 1:
                ## TODO: Set this as an input param
                self.gamma = 1 - float(4) / deadline
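                # Ties the discount to the trip deadline: longer deadlines push gamma toward 1,
                # weighting future rewards more heavily (an experimental choice, per the TODO above).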

        ## Update the new state, which is a tuple of state variables
        self.state = (light, oncoming, left, self.next_waypoint)
        ## Get the best q_action in the new state
        new_action = self.select_action(state=self.state,
                                        is_current=False,
                                        t=t)
        ## Get the new Q_value
        new_q = self.q_function[self.state][new_action]

        ## Update the Q-function
        #         current_alpha = 1 / math.sqrt(t+1)
        current_alpha = self.alpha
        self.q_function[current_state][action] = (
            1 - current_alpha) * current_q + current_alpha * (
                reward + self.gamma * new_q)
        #         print 'Updated Q = ' + str(self.q_function[current_state][action])
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
Example #55
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.T = 0.0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:

            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            self.T = self.T + 1.0
            #self.epsilon = self.epsilon - 0.05;
            #self.epsilon = 1.0 / ( (self.T * self.T) + (self.alpha * self.T))
            #self.epsilon = self.epsilon = 1.0 / (self.T **2)
            self.epsilon = math.fabs(math.cos(self.alpha * self.T))

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = {
            "light": inputs["light"],
            "oncoming": inputs["oncoming"],
            "left": inputs["left"],
            "direction": waypoint
        }

        return state

    def get_maxQ(self, hash_key):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = -1000.0
        for action in self.Q[hash_key]:
            if maxQ < self.Q[hash_key][action]:
                maxQ = self.Q[hash_key][action]

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        state_key = self.hash_key(state)
        print "state: {}".format(state_key)
        if self.learning:
            self.Q[state_key] = self.Q.get(state_key, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })
        #print self.Q
        return state

    def hash_key(self, state):
        """ Method to create the key string object based on the state passed """
        light = state['light']
        oncoming = state['oncoming']
        left = state['left']
        direction = state['direction']

        if light is None:
            light = 'None'
        if oncoming is None:
            oncoming = 'None'
        if left is None:
            left = 'None'
        if direction is None:
            direction = 'None'

        return str(light + '_' + oncoming + '_' + left + '_' + direction)

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            actionSelected = random.choice(self.valid_actions)
            print "Not Learning....setting action: {}".format(
                random.choice(self.valid_actions))
            action = actionSelected
        else:
            if self.epsilon > 0.01 and self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                possible_actions = []
                hash_key = self.hash_key(state)
                maxQ = self.get_maxQ(hash_key)
                for action in self.Q[hash_key]:
                    if maxQ == self.Q[hash_key][action]:
                        possible_actions.append(action)
                action = random.choice(possible_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        hash_key = self.hash_key(state)
        if self.learning:
            self.Q[hash_key][action] = self.Q[hash_key][
                action] + self.alpha * (reward - self.Q[hash_key][action])
            #self.Q[hash_key][action] = (1 - self.alpha) * self.Q[hash_key][action] + (reward * self.alpha)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        self.state_def = [
            ['left', 'right', 'forward'],  #waypoint
            ['red', 'green'],  #light
            ['left', 'right', 'forward', None],  #vehicleleft
            ['left', 'right', 'forward', None],  #vehicleright
            ['left', 'right', 'forward', None]  #vehicleoncoming
        ]

        self.template_q = dict((k, 0.0) for k in self.valid_actions)

        for state_tuple in itertools.product(*self.state_def):
            self.Q[state_tuple] = self.template_q.copy()

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            # Use negative exponential e^(-at) decay function
            self.epsilon = math.exp(-self.alpha * self.t)
            self.t += 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent.  Ensure it is the same order as
        # in the class initializer
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())
        maxQ_actions = []
        for action, Q in self.Q[state].items():
            if Q == maxQ:
                maxQ_actions.append(action)

        return maxQ, maxQ_actions

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        if not self.learning:
            return

        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        # Note: This is already performed in class constructor, but replicated here too

        if not state in self.Q:
            self.Q[state] = self.template_q.copy()

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning or random.random() <= self.epsilon:
            action = random.choice(self.valid_actions)
        else:
            maxQ, maxQ_actions = self.get_maxQ(state)
            action = random.choice(maxQ_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards 
            when conducting learning. """

        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = reward * self.alpha + self.Q[state][
                action] * (1 - self.alpha)

        #  I've decided to keep alpha constant, so it is not altered

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
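# A standalone sketch of the Q-table pre-population used in the constructor above;
# the state space and action list are copied from that snippet. Enumerating every
# (waypoint, light, left, right, oncoming) combination up front yields
# 3 * 2 * 4 * 4 * 4 = 384 states, each mapped to zero-initialised action values.
import itertools

def build_full_q_table(valid_actions=(None, 'forward', 'left', 'right')):
    state_def = [
        ['left', 'right', 'forward'],          # waypoint
        ['red', 'green'],                      # light
        ['left', 'right', 'forward', None],    # vehicle left
        ['left', 'right', 'forward', None],    # vehicle right
        ['left', 'right', 'forward', None],    # vehicle oncoming
    ]
    return {state: {a: 0.0 for a in valid_actions}
            for state in itertools.product(*state_def)}
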
Example #57
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0
        self.a = 0.005

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.epsilon = math.cos((self.a) * self.t)
        # self.alpha = math.exp((-self.a) * self.t)
        # self.epsilon = 1/(self.t ** 2)
        # self.epsilon -= 0.05
        self.t = self.t + 1
        if testing == True:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming'])
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0
        if self.learning == True:
            try:
                self.Q[state]
            except KeyError:
                self.Q[state] = dict()
                for action in self.env.valid_actions:
                    self.Q[state][action] = 0

        # state = None

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        # max() over the stored Q-values is clearer than comparing against None,
        # which only works by accident under Python 2's ordering rules
        maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning == True:
            try:
                self.Q[state]
            except KeyError:
                self.Q[state] = dict()
                for action in self.env.valid_actions:
                    self.Q[state][action] = 0.0
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if self.learning == False:
            action = random.choice(self.env.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.env.valid_actions)
            else:
                maxQ = self.get_maxQ(state)  # compute once instead of on every loop pass
                action_choice = []
                for candidate, value in self.Q[state].items():
                    if value == maxQ:
                        action_choice.append(candidate)
                action = random.choice(action_choice)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives an award. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning == True:
            #current_value = self.Q[state][action]
            #learned_value = reward + self.get_maxQ(state)
            #new_q_value = learned_value + (self.alpha * (learned_value - current_value))
            self.Q[state][action] += self.alpha * (reward -
                                                   self.Q[state][action])
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
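# A standalone sketch comparing the epsilon decay schedules tried (and commented out)
# in the reset() method above; a=0.005 matches that snippet, the other constants are
# illustrative, not values taken from the snippets.
import math

def epsilon_schedules(t, a=0.005):
    return {
        'cosine':      math.cos(a * t),                 # slow early decay, used above
        'exponential': math.exp(-a * t),                # smooth geometric decay
        'inverse_sq':  1.0 / (t ** 2) if t else 1.0,    # very aggressive decay
        'linear':      max(0.0, 1.0 - 0.05 * t),        # fixed step per trial
    }
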
Example #58
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=True, epsilon=0.5, alpha=0.5, gamma=0):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        self.gamma = gamma  # Discount factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Create a series of waypoints
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        state = None

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = None

        return maxQ

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        return action

    def learn(self, state, action, reward, new_state):
        """ The learn function is called after the agent completes an action and
            receives an award. 'new_state' is the state the smartcab arrives at
            once completing the action. This is for calculating future rewards. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use both the learning rate 'alpha', and the discount factor, 'gamma'

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        # Update the agent based on the functions built above.
        state = self.build_state()  # Build the agent pre-action state
        action = self.choose_action(
            state)  # Choose an action based on the agent state
        reward = self.env.act(self,
                              action)  # Receive a reward based on the action
        new_state = self.build_state()  # Build the agent's post-action state
        self.learn(state, action, reward,
                   new_state)  # Run the Q-Learning algorithm

        return
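# The skeleton above leaves learn() as a TODO; a minimal sketch of the standard
# discounted update it calls for, assuming a dict-of-dicts Q-table (names illustrative):
#   Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
def discounted_q_update(Q, state, action, reward, new_state, alpha=0.5, gamma=0.0):
    future = max(Q[new_state].values()) if new_state in Q else 0.0
    Q[state][action] += alpha * (reward + gamma * future - Q[state][action])
    return Q[state][action]
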
Example #59
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing == True:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.epsilon - 0.05
        # checks it is not negative
        if self.epsilon <= 0:
            self.epsilon = 0

        return self.epsilon

    def build_state(self):
        """ The build_state function is called when the agent r[equests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        # Note: despite the name, this returns the *action* with the highest Q-value
        # (an argmax over actions), which is what choose_action() below expects
        maxQ = max(self.Q[state], key=lambda key: self.Q[state][key])

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """
        #    valid_actions = [None, 'forward', 'left', 'right']

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if state not in self.Q:
            self.Q[state] = {None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0}
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        # To remember where it came from... https://docs.python.org/2/library/random.html
        # https://junedmunshi.wordpress.com/tag/epsilon-policy/
        if self.learning == False:
            newmove = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                newmove = random.choice(self.valid_actions)
            else:
                newmove = self.get_maxQ(state)
                print("New MAX move is", newmove)

        action = newmove

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning == True:
            #lastStateActionValue = lastStateActionValue + (self.alpha * (reward + maxQ - lastStateActionValue))
            #actionsForLastState[lastAction] = lastStateActionValue
            #self.Q[lastState] = actionsForLastState
            print("learnig in state:", state)
            print("learning in action:", action)
            print("self.Q[state][action]", self.Q[state][action])
            currentQ = self.Q[state][action]
            self.Q[state][action] = reward * self.alpha + currentQ * (
                1 - self.alpha)
            print("NEW self.Q[state][action]", self.Q[state][action])
            #self.Q[self.prev_state][self.prev_action] = ((1 - alpha)* self.Q[self.prev_state][self.prev_action]) + alpha * (self.prev_reward + gamma *maxQ)
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn
        print("Finished Learning....")
        return
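# The comment in choose_action() above asks for random selection among tied best
# actions, but max() with a key always returns a single fixed winner. A small sketch
# of an argmax with random tie-breaking (standalone helper, names are illustrative):
import random

def best_action_random_ties(q_for_state):
    """q_for_state maps action -> Q-value; ties for the maximum are broken at random."""
    best = max(q_for_state.values())
    return random.choice([a for a, q in q_for_state.items() if q == best])
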
Example #60
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env_):
        super(LearningAgent, self).__init__(
            env_
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.state = {}
        self.actions = [None, 'forward', 'left', 'right']
        self.state_roots = [
            'light', 'next_waypoint', 'right', 'left', 'oncoming'
        ]
        #self.state_roots = ['light', 'next_waypoint']
        self.Qtable = {}
        #tuning variables
        self.gamma = 0.1  # discount factor
        self.alpha = 0.9  # learning rate
        self.epsilon = 10  # exploration rate (select random action every x iteration)
        self.overall_simulations = 0
        self.overall_iterations = 0
        self.total_sucess = 0

    def reset(self, destination=None):
        self.overall_simulations += 1
        self.planner.route_to(destination)
        print "\n-----starting new simulation-----"
        self.print_Qtable()
        print "overall_iterations: ", self.overall_iterations
        print "overall_simulations: ", self.overall_simulations
        print "total_sucess: ", self.total_sucess
        print "self.gamma: ", self.gamma
        if self.total_sucess == 0:
            print "sucess_rate: 0%"
        else:
            sucess_rate = float(self.total_sucess) / float(
                self.overall_simulations)
            print "sucess_rate: ", sucess_rate * 100, "%"

    def add_new_state_to_Qtable(self, new_state_):
        #print "ADDING NEW STATE: ", new_state_
        action_state_init = 2
        init_actions = [(None, action_state_init),
                        ('forward', action_state_init),
                        ('left', action_state_init),
                        ('right', action_state_init)]
        state_tuple = {new_state_: init_actions}
        self.Qtable.update(state_tuple)
        return init_actions

    def print_Qtable(self):
        print "self.Qtable:"
        for row in self.Qtable:
            print "\t", row, self.Qtable[row]

    def get_action_from_Qtable(self, state_):
        state_tuple = self.Qtable.get(str(state_))
        new_action_tuple = None

        if state_tuple is None:
            # if the state is not in Q-table create it and choose random action
            state_actions = self.add_new_state_to_Qtable(state_)
            new_action_tuple = random.choice(state_actions)
            #print "new_random_action: ", new_action_tuple
        else:
            # state is in Q_table so get the maximum
            #print "found_existing_Qtable_state: ", state_tuple

            # if all items have the same Q_value, just return a random one
            Q_values = map(lambda state: state[1], state_tuple)
            #print "Q_values:", Q_values
            are_the_same = all(x == Q_values[0] for x in Q_values)
            #print "are_the_same:", are_the_same
            if are_the_same:
                return state_tuple[random.choice([0, 1, 2, 3])]

            #check epsilon greedy
            if ((self.env.t != 0) and ((self.env.t % self.epsilon) == 0)):
                return state_tuple[random.choice([0, 1, 2, 3])]

            max_value = -sys.maxint - 1
            #print "state_tuple:", state_tuple

            for index in range(len(state_tuple)):
                if state_tuple[index][1] > max_value:
                    max_state_index = index
                    max_value = state_tuple[index][1]

            new_action_tuple = state_tuple[max_state_index]
        return new_action_tuple

    def update(self, t):
        self.overall_iterations += 1
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        inputs.update({'next_waypoint': self.next_waypoint})
        inputs.update({'deadline': deadline})

        # Update state
        #print "\n\ninputs:", inputs
        # get only the monitoring states and save them to self.state
        map(lambda state: self.state.update({state: inputs[str(state)]}),
            self.state_roots)

        #self.print_Qtable()

        # 1) ----- get new state from Q-table (according to your policy)
        state_string = str(self.state)
        #print "state_string:", state_string
        new_action_tuple = self.get_action_from_Qtable(state_string)

        # 2) ----- Execute action and get reward
        action = new_action_tuple[0]
        #action = random.choice(self.actions)
        #action = self.next_waypoint
        #print "new_action_tuple:", new_action_tuple
        reward = self.env.act(self, action)
        #print "action REWARD: ", reward

        # 3) ----- Learn policy based on state, action, reward
        new_state = {}
        new_inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        new_inputs.update({'next_waypoint': self.next_waypoint})
        new_inputs.update({'deadline': self.env.get_deadline(self)})

        map(lambda state: new_state.update({state: new_inputs[str(state)]}),
            self.state_roots)
        #print "new_state:", new_state
        max_action_tuple = self.get_action_from_Qtable(str(new_state))

        Q_hat = (1 - self.alpha) * new_action_tuple[1] + self.alpha * (
            reward + self.gamma * max_action_tuple[1])

        #print "Q_hat:", Q_hat
        state_tuples = self.Qtable[state_string]
        # store new/updated value to Qtable
        updated_state_tuples = state_tuples
        i = 0
        for s in state_tuples:
            if s[0] == action: state_tuples[i] = (action, Q_hat)
            i += 1

        #print "updated_state_tuples:", updated_state_tuples
        self.Qtable[state_string] = updated_state_tuples

        if self.env.trial_data['success'] == 1:
            self.total_sucess += 1
            self.epsilon += 1  # epsilon here is the gap between forced random actions, so raising it reduces exploration
            # for every 10 successes, increase the discount factor gamma by 0.1 (capped below 1)
            if (self.total_sucess > 1 and ((self.total_sucess % 10) == 0)
                    and (self.gamma < 1)):
                self.gamma += 0.1